Group-6 Assignment 1

Group-6_Assignment-1
September 24, 2022
[8]: # Group 6 Assignment 1 - EDA

# Group Members
# Yash Agasty - N20212216
# Uditya Narayan Pal - N20212216
# Yash Mehta - M20212711
# Suvashish Pandey - N20211077
# Abhishek Chakraborty - N20212201
# Ritesh Kumar - N20212324
[4]: #Load the required libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
[5]: #Load the data

# Read the cuisine-rating survey CSV into the working DataFrame `df`.
# The path is kept on a single line: a quoted string literal cannot span lines.
# NOTE(review): this is an absolute, machine-specific Windows path, so the notebook
# only runs where this exact folder exists — consider a path relative to the notebook.
df = pd.read_csv('C:\\Users\\yasha\\Desktop\\New folder (3)\\IDBA\\Cuisine_rating.csv')
[6]: df.head()
[6]: User ID Area code Location Gender YOB Marital Status \

0 1 153 Upper East Side,NY Female 2006 Single
1 2 123 St. George,NY Female 1991 Married
2 3 122 Upper West Side,NY Male 1977 Single
3 4 153 Upper East Side,NY Female 1956 Married
4 5 129 Central Park,NY Male 1997 Single
Activity Budget Cuisines Alcohol Smoker Food Rating \

0 Professional 3 Japanese Never Never 5
1 Student 3 Indian Never Socially 1
2 Student 5 Seafood Often Often 5
3 Professional 5 Japanese Never Socially 3
4 Student 4 Filipino Socially Never 2
Service Rating Overall Rating Often A S

0 4 4.5 No
1
1 1 1.0 No
2 5 5.0 Yes
3 1 2.0 No
4 4 3.0 No
[7]: df.tail()

195 196 175 St. George,NY Female 1982 Single
196 197 170 Upper West Side,NY Female 2000 Married
198 199 130 St. George,NY Male 2002 Married
199 200 140 Upper East Side,NY Male 2005 Married

195 Professional 4 French Never Socially 1
196 Student 4 Chinese Never Often 1
197 Professional 5 Japanese Never Often 5
198 Student 3 Filipino Never Socially 3
199 Student 4 French Never Never 3

195 2 1.5 No
196 2 1.5 No
197 2 3.5 No
198 2 2.5 No
199 2 2.5 No
[9]: df.sample(10)

154 155 136 Riverdale,NY Female 1958 Single
134 135 172 Central Park,NY Male 2001 Divorced
105 106 172 China Town, NY Male 1995 Single
193 194 103 Riverdale,NY Female 1985 Married
94 95 111 China Town, NY Male 1974 Married
9 10 129 Central Park,NY Male 1995 Single
49 50 107 Riverdale,NY Female 1974 Married
153 154 138 St. George,NY Male 1989 Single
172 173 184 St. George,NY Male 1962 Married

154 Professional 5 Filipino Socially Socially 5
134 Student 1 Japanese Socially Often 4
105 Student 5 Seafood Never Often 4
193 Student 5 Filipino Often Never 1
2
94 Student 3 Indian Often Often 2
9 Student 4 Chinese Often Often 5
49 Student 5 Japanese Socially Socially 2
153 Professional 5 Filipino Socially Often 4
112 Professional 5 Chinese Socially Never 5
172 Professional 4 Indian Never Never 1

154 3 4.0 No
134 5 4.5 No
105 3 3.5 No
193 4 2.5 No
94 5 3.5 Yes
9 2 3.5 Yes
49 5 3.5 No
153 4 4.0 No
112 4 4.5 No
172 1 1.0 No
[10]: df.columns
[10]: Index(['User ID', 'Area code', 'Location', 'Gender', 'YOB', 'Marital Status',
'Activity', 'Budget', 'Cuisines', 'Alcohol ', 'Smoker', 'Food Rating',
'Service Rating', 'Overall Rating', 'Often A S'],
dtype='object')
[ ]:
[12]: #Basic information
# Print the structural summary: index range, per-column non-null counts, dtypes, memory usage.
df.info()
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# As the last expression of the cell, this table is what the cell displays.
df.describe()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 User ID 200 non-null int64
1 Area code 200 non-null int64
2 Location 200 non-null object
3 Gender 200 non-null object
4 YOB 200 non-null int64
5 Marital Status 200 non-null object
6 Activity 200 non-null object
3
7 Budget 200 non-null int64
8 Cuisines 200 non-null object
9 Alcohol 200 non-null object
10 Smoker 200 non-null object
11 Food Rating 200 non-null int64
12 Service Rating 200 non-null int64
13 Overall Rating 200 non-null float64
14 Often A S 200 non-null object
dtypes: float64(1), int64(6), object(8)
memory usage: 23.6+ KB
[12]: User ID Area code YOB Budget Food Rating \

count 200.000000 200.000000 200.000000 200.000000 200.000000
mean 100.500000 141.060000 1984.830000 3.815000 3.220000
std 57.879185 26.130257 16.809339 1.056578 1.411226
min 1.000000 101.000000 1955.000000 1.000000 1.000000
25% 50.750000 123.000000 1971.000000 3.000000 2.000000
50% 100.500000 135.000000 1987.000000 4.000000 3.000000
75% 150.250000 158.000000 2000.000000 5.000000 5.000000
max 200.000000 199.000000 2009.000000 5.000000 5.000000
Service Rating Overall Rating

count 200.000000 200.000000
mean 3.230000 3.225000
std 1.526022 1.079445
min 1.000000 1.000000
25% 2.000000 2.500000
50% 3.000000 3.000000
75% 5.000000 4.000000
max 5.000000 5.000000
[13]: #Find the duplicates
df.duplicated().sum()
[13]: 0
[15]: #Datatypes
df.dtypes
[15]: User ID int64

Area code int64
Location object
Gender object
YOB int64
Marital Status object
Activity object
4
Budget int64
Cuisines object
Alcohol object
Smoker object
Food Rating int64
Service Rating int64
Overall Rating float64
Often A S object
dtype: object
[16]: df.Cuisines.value_counts()
[16]: Japanese 36
French 34
Filipino 34
Indian 32
Chinese 24
Seafood 22
Italian 18
Name: Cuisines, dtype: int64
[17]: df.Cuisines.value_counts().plot(kind="bar")
# Label the bar chart that was just drawn on the current axes, then render it.
plt.gca().set_title("Cuisines")
plt.gca().set_xlabel("Cuisine type")
# Keep the category names horizontal so they stay readable.
plt.setp(plt.gca().get_xticklabels(), rotation=0)
plt.gca().set_ylabel("Counts")
plt.show()
5
[18]: #Find null values
df.isnull().sum()
[18]: User ID 0
Area code 0
Location 0
Gender 0
YOB 0
Marital Status 0
Activity 0
Budget 0
Cuisines 0
Alcohol 0
Smoker 0
Food Rating 0
Service Rating 0
Overall Rating 0
Often A S 0
dtype: int64
[19]: sns.catplot(x="Cuisines", y="Food Rating", data=df, kind="box", aspect=1.5)

# Title the box plot on the current axes (spelling of "Cuisines" corrected),
# then render the figure.
plt.title("Boxplot for Cuisines vs Rating")
plt.show()
[20]: print(f"Skewness: {df['Overall Rating'].skew()}")

print(f"Kurtosis: {df['Overall Rating'].kurt()}")
Skewness: 0.05531324050499118
Kurtosis: -0.7037624403662659
[21]: #Correlation
# Pairwise correlation between the numeric columns only.
# The frame also holds text columns (Location, Gender, Cuisines, ...). Older pandas
# dropped them silently inside corr(), but pandas >= 2.0 raises a ValueError instead.
# Selecting the numeric dtypes first yields the same matrix on every version.
df.select_dtypes(include="number").corr()
[21]: User ID Area code YOB Budget Food Rating \

User ID 1.000000 0.463977 0.006203 -0.010148 -0.003691
Area code 0.463977 1.000000 -0.065006 -0.046191 0.000458
YOB 0.006203 -0.065006 1.000000 -0.071383 0.040774
Budget -0.010148 -0.046191 -0.071383 1.000000 0.057764
Food Rating -0.003691 0.000458 0.040774 0.057764 1.000000
Service Rating 0.111227 -0.011942 0.043651 -0.135542 0.079056
Overall Rating 0.076208 -0.008142 0.057508 -0.058049 0.709562
Service Rating Overall Rating
7
User ID 0.111227 0.076208
Area code -0.011942 -0.008142
YOB 0.043651 0.057508
Budget -0.135542 -0.058049
Food Rating 0.079056 0.709562
Service Rating 1.000000 0.758532
Overall Rating 0.758532 1.000000
[22]: #Correlation plot
# Heatmap of the correlation matrix of the numeric columns.
# Text columns are excluded explicitly: corr() on a frame with object columns
# raises a ValueError on pandas >= 2.0 (older versions dropped them silently).
sns.heatmap(df.select_dtypes(include="number").corr())
[22]: <AxesSubplot:>
[ ]:

Group-6 Assignment 1

Uploaded by

Document Information

Original Description:

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Group-6 Assignment 1

Uploaded by

Copyright:

Available Formats

Group-6_Assignment-1

September 24, 2022

[8]: # Group 6 Assignment 1 - EDA

[4]: #Load the required libraries

[5]: #Load the data

[6]: User ID Area code Location Gender YOB Marital Status \

Activity Budget Cuisines Alcohol Smoker Food Rating \

Service Rating Overall Rating Often A S

[7]: User ID Area code Location Gender YOB Marital Status \

Activity Budget Cuisines Alcohol Smoker Food Rating \

Service Rating Overall Rating Often A S

[9]: User ID Area code Location Gender YOB Marital Status \

Activity Budget Cuisines Alcohol Smoker Food Rating \

Service Rating Overall Rating Often A S

[12]: #Basic information

#Describe the data

[12]: User ID Area code YOB Budget Food Rating \

Service Rating Overall Rating

[13]: #Find the duplicates

[15]: User ID int64

[19]: sns.catplot(x="Cuisines", y="Food Rating", data=df, kind="box", aspect=1.5)

[20]: print(f"Skewness: {df['Overall Rating'].skew()}")

[21]: User ID Area code YOB Budget Food Rating \

Service Rating Overall Rating

[22]: #Correlation plot

You might also like