Download as pdf or txt
Download as pdf or txt
You are on page 1 of 14

SMDM Project Gopala Satish Kumar

December 12, 2021

[6]: import numpy as np


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from IPython.display import HTML
import pylab as py
from scipy.stats import ttest_1samp
from scipy.stats import ttest_ind

1 Problem 1
[8]: data1=pd.read_csv('C:/Users/kumar/Desktop/python projects/SMDM/porject/
,→Wholesale+Customers+Data.csv')

datahead=data1.head()
datahe=HTML(datahead.to_html(classes='table table-bordered'))
datahe

[8]: <IPython.core.display.HTML object>

1.1 Heatmap
[95]: corr=data1.corr()
plt.subplots(figsize=(8,8))
sns.heatmap(corr,annot=True,cmap='YlGnBu')
plt.show()

1
[86]: # Summary statistics for every column of data1 (include='all' covers the
# string columns as well), rendered as a bordered HTML table.
datadescribe=data1.describe(include='all')
datade=HTML(datadescribe.to_html(classes='table table-bordered'))
datade

[86]: <IPython.core.display.HTML object>

[63]: # List each column's dtype and non-null count.
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----

2
0 Buyer/Spender 440 non-null int64
1 Channel 440 non-null object
2 Region 440 non-null object
3 Fresh 440 non-null int64
4 Milk 440 non-null int64
5 Grocery 440 non-null int64
6 Frozen 440 non-null int64
7 Detergents_Paper 440 non-null int64
8 Delicatessen 440 non-null int64
dtypes: int64(7), object(2)
memory usage: 31.1+ KB

1.2 Pairplot
[97]: # Pairwise-relationship grid for data1; the call returns a seaborn PairGrid,
# whose repr is the cell output.
sns.pairplot(data1)

[97]: <seaborn.axisgrid.PairGrid at 0x1db152a1ee0>

3
1.3 1.1 Mean of all variables along with Channel and Region
[88]: datapivot=pd.pivot_table(data=data1,index=['Channel','Region'],aggfunc=np.sum)
datapi=HTML(datapivot.to_html(classes='table table-bordered'))
datapi

[88]: <IPython.core.display.HTML object>

4
1.4 1.2 Box plot
[32]: fig,axes=plt.subplots(3,2,figsize=(15,15))
sns.
,→boxplot(ax=axes[0,0],x=data1['Region'],y=data1['Fresh'],hue=data1['Channel'])

sns.boxplot(ax=axes[0,1],x=data1['Region'],y=data1['Milk'],hue=data1['Channel'])
sns.
,→boxplot(ax=axes[1,0],x=data1['Region'],y=data1['Grocery'],hue=data1['Channel'])

sns.
,→boxplot(ax=axes[1,1],x=data1['Region'],y=data1['Frozen'],hue=data1['Channel'])

sns.
,→boxplot(ax=axes[2,0],x=data1['Region'],y=data1['Detergents_Paper'],hue=data1['Channel'])

sns.
,→boxplot(ax=axes[2,1],x=data1['Region'],y=data1['Delicatessen'],hue=data1['Channel'])

[32]: <AxesSubplot:xlabel='Region', ylabel='Delicatessen'>

5
1.5 1.3 Mean and Median
[119]: round(data1.mean())

[119]: Buyer/Spender 220.0


Fresh 12000.0
Milk 5796.0
Grocery 7951.0
Frozen 3072.0
Detergents_Paper 2881.0
Delicatessen 1525.0
dtype: float64

[143]: round(data1.median())

[143]: Buyer/Spender 220.0


Fresh 8504.0
Milk 3627.0
Grocery 4756.0
Frozen 1526.0
Detergents_Paper 816.0
Delicatessen 966.0
dtype: float64

[138]: datatemp.head()

[138]: Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicatessen


0 Retail Other 12669 9656 7561 214 2674 1338
1 Retail Other 7057 9810 9568 1762 3293 1776
2 Retail Other 6353 8808 7684 2405 3516 7844
3 Hotel Other 13265 1196 4221 6404 507 1788
4 Retail Other 22615 5410 7198 3915 1777 5185

[9]: fig,axes=plt.subplots(3,2,figsize=(15,15))
datatemp=data1.drop(['Buyer/Spender'],axis=1)
sns.distplot(ax=axes[0,0],x=datatemp['Fresh'],axlabel='Fresh');
sns.distplot(ax=axes[0,1],x=datatemp['Milk'],axlabel='Milk');
sns.distplot(ax=axes[1,0],x=datatemp['Grocery'],axlabel='Grocery');
sns.distplot(ax=axes[1,1],x=datatemp['Frozen'],axlabel='Frozen');
sns.
,→distplot(ax=axes[2,0],x=datatemp['Detergents_Paper'],axlabel='Detergents_Paper');

,→

sns.distplot(ax=axes[2,1],x=datatemp['Delicatessen'],axlabel='Delicatessen');
plt.show()

C:\Users\kumar\anaconda3\lib\site-packages\seaborn\distributions.py:2557:
FutureWarning: `distplot` is a deprecated function and will be removed in a

6
future version. Please adapt your code to use either `displot` (a figure-level
function with similar flexibility) or `histplot` (an axes-level function for
histograms).
warnings.warn(msg, FutureWarning)
C:\Users\kumar\anaconda3\lib\site-packages\seaborn\distributions.py:2557:
FutureWarning: `distplot` is a deprecated function and will be removed in a
future version. Please adapt your code to use either `displot` (a figure-level
function with similar flexibility) or `histplot` (an axes-level function for
histograms).
warnings.warn(msg, FutureWarning)
C:\Users\kumar\anaconda3\lib\site-packages\seaborn\distributions.py:2557:
FutureWarning: `distplot` is a deprecated function and will be removed in a
future version. Please adapt your code to use either `displot` (a figure-level
function with similar flexibility) or `histplot` (an axes-level function for
histograms).
warnings.warn(msg, FutureWarning)
C:\Users\kumar\anaconda3\lib\site-packages\seaborn\distributions.py:2557:
FutureWarning: `distplot` is a deprecated function and will be removed in a
future version. Please adapt your code to use either `displot` (a figure-level
function with similar flexibility) or `histplot` (an axes-level function for
histograms).
warnings.warn(msg, FutureWarning)
C:\Users\kumar\anaconda3\lib\site-packages\seaborn\distributions.py:2557:
FutureWarning: `distplot` is a deprecated function and will be removed in a
future version. Please adapt your code to use either `displot` (a figure-level
function with similar flexibility) or `histplot` (an axes-level function for
histograms).
warnings.warn(msg, FutureWarning)
C:\Users\kumar\anaconda3\lib\site-packages\seaborn\distributions.py:2557:
FutureWarning: `distplot` is a deprecated function and will be removed in a
future version. Please adapt your code to use either `displot` (a figure-level
function with similar flexibility) or `histplot` (an axes-level function for
histograms).
warnings.warn(msg, FutureWarning)

7
[76]: datatemp=data1.drop(['Buyer/Spender'],axis=1)
data_region=pd.pivot_table(data=datatemp,index=['Region']).T
data_region.to_csv('data_region.csv',index=True)
data_region

[76]: Region Lisbon Oporto Other


Delicatessen 1354.896104 1159.702128 1620.601266
Detergents_Paper 2651.116883 3687.468085 2817.753165
Fresh 11101.727273 9887.680851 12533.471519
Frozen 3000.337662 4045.361702 2944.594937
Grocery 7403.077922 9218.595745 7896.363924
Milk 5486.415584 5088.170213 5977.085443

8
2 Problem 2
2.1 Sample of dataset
[6]: data2=pd.read_csv('C:/Users/kumar/Desktop/python projects/SMDM/porject/Survey-1.
,→csv')

data2head=data2.head()
data2he=HTML(data2head.to_html(classes='table table-bordered'))
data2he

[6]: <IPython.core.display.HTML object>

2.2 Statistics variables of the data


[5]: # Summary statistics for every column of data2 (include='all' covers the
# string columns as well), rendered as a bordered HTML table.
data2describe=data2.describe(include='all')
data2de=HTML(data2describe.to_html(classes='table table-bordered'))
data2de

[5]: <IPython.core.display.HTML object>

2.3 Checking nulls and datatypes of the variables


[6]: # List each column's dtype and non-null count.
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62 entries, 0 to 61
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 62 non-null int64
1 Gender 62 non-null object
2 Age 62 non-null int64
3 Class 62 non-null object
4 Major 62 non-null object
5 Grad Intention 62 non-null object
6 GPA 62 non-null float64
7 Employment 62 non-null object
8 Salary 62 non-null float64
9 Social Networking 62 non-null int64
10 Satisfaction 62 non-null int64
11 Spending 62 non-null int64
12 Computer 62 non-null object
13 Text Messages 62 non-null int64
dtypes: float64(2), int64(6), object(6)
memory usage: 6.9+ KB

9
2.4 2.1 Contingency table
2.4.1 2.1.1 Gender v/s Major

[12]: data2_crosstab1 = pd.crosstab(index=data2['Gender'],columns=data2['Major'],␣


,→margins=True)

data2_ct1=HTML(data2_crosstab1.to_html(classes='table table-bordered'))
data2_ct1

[12]: <IPython.core.display.HTML object>

2.4.2 2.1.2 Gender v/s Grad Intention

[7]: data2_crosstab2 = pd.crosstab(index=data2['Gender'],columns=data2['Grad␣


,→Intention'], margins=True)

data2_ct2=HTML(data2_crosstab2.to_html(classes='table table-bordered'))
data2_ct2

[7]: <IPython.core.display.HTML object>

2.4.3 2.1.3 Gender v/s Employment

[14]: data2_crosstab3 = pd.


,→crosstab(index=data2['Gender'],columns=data2['Employment'], margins=True)

data2_ct3=HTML(data2_crosstab3.to_html(classes='table table-bordered'))
data2_ct3

[14]: <IPython.core.display.HTML object>

2.4.4 2.1.4 Gender v/s Computer

[15]: data2_crosstab4 = pd.crosstab(index=data2['Gender'],columns=data2['Computer'],␣


,→margins=True)

data2_ct4=HTML(data2_crosstab4.to_html(classes='table table-bordered'))
data2_ct4

[15]: <IPython.core.display.HTML object>

2.5 2.6 Contingency table of Gender v/s Grad Intention(drop Undecided)


[8]: data2_crosstab2drop=data2_crosstab2.drop(columns='Undecided')
data2_ct2dp=HTML(data2_crosstab2drop.to_html(classes='table table-bordered'))
data2_ct2dp

[8]: <IPython.core.display.HTML object>

10
2.6 2.7
[35]: data2lessgpa=data2[data2['GPA']<3]
data2lessgpa.groupby(['Gender'])['Gender'].count()

[35]: Gender
Female 8
Male 9
Name: Gender, dtype: int64

[42]: data2moresalary=data2[data2['Salary']>50]
data2moresalary.groupby(['Gender'])['Gender'].count()

[42]: Gender
Female 13
Male 10
Name: Gender, dtype: int64

[44]: data2equalsalary=data2[data2['Salary']==50]
data2equalsalary.groupby(['Gender'])['Gender'].count()

[44]: Gender
Female 5
Male 4
Name: Gender, dtype: int64

2.7 2.8 Mean, Median and Distplot


[47]: data2.mean()

[47]: ID 31.500000
Age 21.129032
GPA 3.129032
Salary 48.548387
Social Networking 1.516129
Satisfaction 3.741935
Spending 482.016129
Text Messages 246.209677
dtype: float64

[49]: data2.median()

[49]: ID 31.50
Age 21.00
GPA 3.15
Salary 50.00
Social Networking 1.00
Satisfaction 4.00

11
Spending 500.00
Text Messages 200.00
dtype: float64

[51]: fig,axes=plt.subplots(2,2,figsize=(15,15))
sns.distplot(ax=axes[0,0],x=data2['GPA'],axlabel='GPA')
sns.distplot(ax=axes[0,1],x=data2['Salary'],axlabel='Salary')
sns.distplot(ax=axes[1,0],x=data2['Spending'],axlabel='Spending')
sns.distplot(ax=axes[1,1],x=data2['Text Messages'],axlabel='Text Messages')
plt.show()

C:\Users\kumar\anaconda3\lib\site-packages\seaborn\distributions.py:2557:
FutureWarning: `distplot` is a deprecated function and will be removed in a
future version. Please adapt your code to use either `displot` (a figure-level
function with similar flexibility) or `histplot` (an axes-level function for
histograms).
warnings.warn(msg, FutureWarning)
C:\Users\kumar\anaconda3\lib\site-packages\seaborn\distributions.py:2557:
FutureWarning: `distplot` is a deprecated function and will be removed in a
future version. Please adapt your code to use either `displot` (a figure-level
function with similar flexibility) or `histplot` (an axes-level function for
histograms).
warnings.warn(msg, FutureWarning)
C:\Users\kumar\anaconda3\lib\site-packages\seaborn\distributions.py:2557:
FutureWarning: `distplot` is a deprecated function and will be removed in a
future version. Please adapt your code to use either `displot` (a figure-level
function with similar flexibility) or `histplot` (an axes-level function for
histograms).
warnings.warn(msg, FutureWarning)
C:\Users\kumar\anaconda3\lib\site-packages\seaborn\distributions.py:2557:
FutureWarning: `distplot` is a deprecated function and will be removed in a
future version. Please adapt your code to use either `displot` (a figure-level
function with similar flexibility) or `histplot` (an axes-level function for
histograms).
warnings.warn(msg, FutureWarning)

12
3 Problem 3
3.0.1 Sample of given data

[2]: data3=pd.read_csv('C:/Users/kumar/Desktop/python projects/SMDM/porject/


,→A+&+B+shingles.csv')

data3.head()

[2]: A B
0 0.44 0.14
1 0.61 0.15
2 0.47 0.31
3 0.30 0.16
4 0.15 0.37

13
3.0.2 Nulls and datatypes of the given data

[6]: # List each column's dtype and non-null count.
data3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 A 36 non-null float64
1 B 31 non-null float64
dtypes: float64(2)
memory usage: 704.0 bytes

3.1 3.1 One Sample t-test


[7]: t_statistic, p_value = ttest_1samp(data3.A, 0.35)
print('One sample t test \nt statistic: {0} p value: {1} '.format(t_statistic,␣
,→p_value/2))

One sample t test


t statistic: -1.4735046253382782 p value: 0.07477633144907513

[8]: t_statistic, p_value = ttest_1samp(data3.B, 0.35,nan_policy='omit' )


print('One sample t test \nt statistic: {0} p value: {1} '.format(t_statistic,␣
,→p_value/2))

One sample t test


t statistic: -3.1003313069986995 p value: 0.0020904774003191826

[17]: # Print the sample mean of each column, for comparison with the 0.35
# reference value used in the one-sample t-tests.
print('mean of sample A',data3['A'].mean())
print('mean of sample B',data3['B'].mean())

mean of sample A 0.3166666666666666


mean of sample B 0.2735483870967742

3.2 3.2 Two Sample Independent t-test


[11]: t_statistic,p_value=ttest_ind(data3['A'],data3['B'],equal_var=True␣
,→,nan_policy='omit')

print("t_statistic={} and pvalue={}".


,→format(round(t_statistic,3),round(p_value,3)))

t_statistic=1.29 and pvalue=0.202

14

You might also like