Download as pdf or txt
Download as pdf or txt
You are on page 1 of 8

Group-6_Assignment-1

September 24, 2022

[8]: # Group 6 Assignment 1 - EDA


# Group Members
# Yash Agasty - N20212216
# Uditya Narayan Pal - N20212216
# Yash Mehta - M20212711
# Suvashish Pandey - N20211077
# Abhishek Chakraborty - N20212201
# Ritesh Kumar - N20212324

[4]: #Load the required libraries


import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

[5]: #Load the data


# Location of the cuisine-rating survey CSV, kept as a single string literal.
# NOTE(review): this is an absolute, machine-specific Windows path, so the notebook
# only runs where this folder exists; consider a path relative to the notebook.
DATA_PATH = 'C:\\Users\\yasha\\Desktop\\New folder (3)\\IDBA\\Cuisine_rating.csv'

# Read the whole file into the DataFrame used by every later cell.
df = pd.read_csv(DATA_PATH)

[6]: df.head()  # preview the first rows of the frame (rows 0-4 in the output below)

[6]: User ID Area code Location Gender YOB Marital Status \


0 1 153 Upper East Side,NY Female 2006 Single
1 2 123 St. George,NY Female 1991 Married
2 3 122 Upper West Side,NY Male 1977 Single
3 4 153 Upper East Side,NY Female 1956 Married
4 5 129 Central Park,NY Male 1997 Single

Activity Budget Cuisines Alcohol Smoker Food Rating \


0 Professional 3 Japanese Never Never 5
1 Student 3 Indian Never Socially 1
2 Student 5 Seafood Often Often 5
3 Professional 5 Japanese Never Socially 3
4 Student 4 Filipino Socially Never 2

Service Rating Overall Rating Often A S


0 4 4.5 No

1
1 1 1.0 No
2 5 5.0 Yes
3 1 2.0 No
4 4 3.0 No

[7]: df.tail()  # preview the last rows of the frame (rows 195-199 in the output below)

[7]: User ID Area code Location Gender YOB Marital Status \


195 196 175 St. George,NY Female 1982 Single
196 197 170 Upper West Side,NY Female 2000 Married
197 198 160 St. George,NY Female 2006 Single
198 199 130 St. George,NY Male 2002 Married
199 200 140 Upper East Side,NY Male 2005 Married

Activity Budget Cuisines Alcohol Smoker Food Rating \


195 Professional 4 French Never Socially 1
196 Student 4 Chinese Never Often 1
197 Professional 5 Japanese Never Often 5
198 Student 3 Filipino Never Socially 3
199 Student 4 French Never Never 3

Service Rating Overall Rating Often A S


195 2 1.5 No
196 2 1.5 No
197 2 3.5 No
198 2 2.5 No
199 2 2.5 No

[9]: df.sample(10)  # 10 randomly drawn rows; no random_state is passed, so the rows shown differ on every run

[9]: User ID Area code Location Gender YOB Marital Status \


154 155 136 Riverdale,NY Female 1958 Single
134 135 172 Central Park,NY Male 2001 Divorced
105 106 172 China Town, NY Male 1995 Single
193 194 103 Riverdale,NY Female 1985 Married
94 95 111 China Town, NY Male 1974 Married
9 10 129 Central Park,NY Male 1995 Single
49 50 107 Riverdale,NY Female 1974 Married
153 154 138 St. George,NY Male 1989 Single
112 113 101 St. George,NY Female 2006 Single
172 173 184 St. George,NY Male 1962 Married

Activity Budget Cuisines Alcohol Smoker Food Rating \


154 Professional 5 Filipino Socially Socially 5
134 Student 1 Japanese Socially Often 4
105 Student 5 Seafood Never Often 4
193 Student 5 Filipino Often Never 1

2
94 Student 3 Indian Often Often 2
9 Student 4 Chinese Often Often 5
49 Student 5 Japanese Socially Socially 2
153 Professional 5 Filipino Socially Often 4
112 Professional 5 Chinese Socially Never 5
172 Professional 4 Indian Never Never 1

Service Rating Overall Rating Often A S


154 3 4.0 No
134 5 4.5 No
105 3 3.5 No
193 4 2.5 No
94 5 3.5 Yes
9 2 3.5 Yes
49 5 3.5 No
153 4 4.0 No
112 4 4.5 No
172 1 1.0 No

[10]: df.columns  # list the column labels; note that 'Alcohol ' carries a trailing space in the output

[10]: Index(['User ID', 'Area code', 'Location', 'Gender', 'YOB', 'Marital Status',
'Activity', 'Budget', 'Cuisines', 'Alcohol ', 'Smoker', 'Food Rating',
'Service Rating', 'Overall Rating', 'Often A S'],
dtype='object')

[ ]:

[12]: #Basic information

# Print the structural summary of the frame: index range, column names,
# non-null counts, dtypes and memory usage.
df.info()

# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
# As the last expression of the cell, the resulting table is rendered as its output.
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 User ID 200 non-null int64
1 Area code 200 non-null int64
2 Location 200 non-null object
3 Gender 200 non-null object
4 YOB 200 non-null int64
5 Marital Status 200 non-null object
6 Activity 200 non-null object

3
7 Budget 200 non-null int64
8 Cuisines 200 non-null object
9 Alcohol 200 non-null object
10 Smoker 200 non-null object
11 Food Rating 200 non-null int64
12 Service Rating 200 non-null int64
13 Overall Rating 200 non-null float64
14 Often A S 200 non-null object
dtypes: float64(1), int64(6), object(8)
memory usage: 23.6+ KB

[12]: User ID Area code YOB Budget Food Rating \


count 200.000000 200.000000 200.000000 200.000000 200.000000
mean 100.500000 141.060000 1984.830000 3.815000 3.220000
std 57.879185 26.130257 16.809339 1.056578 1.411226
min 1.000000 101.000000 1955.000000 1.000000 1.000000
25% 50.750000 123.000000 1971.000000 3.000000 2.000000
50% 100.500000 135.000000 1987.000000 4.000000 3.000000
75% 150.250000 158.000000 2000.000000 5.000000 5.000000
max 200.000000 199.000000 2009.000000 5.000000 5.000000

Service Rating Overall Rating


count 200.000000 200.000000
mean 3.230000 3.225000
std 1.526022 1.079445
min 1.000000 1.000000
25% 2.000000 2.500000
50% 3.000000 3.000000
75% 5.000000 4.000000
max 5.000000 5.000000

[13]: #Find the duplicates

# Number of rows flagged by DataFrame.duplicated(), i.e. rows that repeat another row in full.
df.duplicated().sum()

[13]: 0

[15]: #Datatypes

# dtype of every column (the same dtypes are also listed by df.info()).
df.dtypes

[15]: User ID int64


Area code int64
Location object
Gender object
YOB int64
Marital Status object
Activity object

4
Budget int64
Cuisines object
Alcohol object
Smoker object
Food Rating int64
Service Rating int64
Overall Rating float64
Often A S object
dtype: object

[16]: df.Cuisines.value_counts()  # number of rows per cuisine value, listed in descending order of count

[16]: Japanese 36
French 34
Filipino 34
Indian 32
Chinese 24
Seafood 22
Italian 18
Name: Cuisines, dtype: int64

[17]: df.Cuisines.value_counts().plot(kind="bar")
plt.title("Cuisines")
plt.xlabel("Cuisine type")
plt.xticks(rotation=0)
plt.ylabel("Counts")
plt.show()

5
[18]: #Find null values

# Per-column count of missing (null) values.
df.isnull().sum()

[18]: User ID 0
Area code 0
Location 0
Gender 0
YOB 0
Marital Status 0
Activity 0
Budget 0
Cuisines 0
Alcohol 0
Smoker 0
Food Rating 0
Service Rating 0
Overall Rating 0
Often A S 0
dtype: int64

[19]: sns.catplot(x="Cuisines", y="Food Rating", data=df, kind="box", aspect=1.5)


plt.title("Boxplot for Cusines vs Rating")

6
plt.show()

[20]: print(f"Skewness: {df['Overall Rating'].skew()}")


print(f"Kurtosis: {df['Overall Rating'].kurt()}")

Skewness: 0.05531324050499118
Kurtosis: -0.7037624403662659

[21]: #Correlation

# Pairwise correlation of the numeric columns only.  The frame also holds object
# (string) columns; pandas 2.x raises on those in DataFrame.corr() instead of
# silently skipping them, so they are filtered out explicitly.  The resulting
# matrix covers the same seven numeric columns as before.
# NOTE(review): 'User ID' is presumably just an identifier, so its correlations
# are unlikely to be meaningful - confirm before interpreting them.
df.select_dtypes(include="number").corr()

[21]: User ID Area code YOB Budget Food Rating \


User ID 1.000000 0.463977 0.006203 -0.010148 -0.003691
Area code 0.463977 1.000000 -0.065006 -0.046191 0.000458
YOB 0.006203 -0.065006 1.000000 -0.071383 0.040774
Budget -0.010148 -0.046191 -0.071383 1.000000 0.057764
Food Rating -0.003691 0.000458 0.040774 0.057764 1.000000
Service Rating 0.111227 -0.011942 0.043651 -0.135542 0.079056
Overall Rating 0.076208 -0.008142 0.057508 -0.058049 0.709562

Service Rating Overall Rating

7
User ID 0.111227 0.076208
Area code -0.011942 -0.008142
YOB 0.043651 0.057508
Budget -0.135542 -0.058049
Food Rating 0.079056 0.709562
Service Rating 1.000000 0.758532
Overall Rating 0.758532 1.000000

[22]: #Correlation plot

# Heatmap of the pairwise correlations between the numeric columns.
# Object (string) columns are dropped first because DataFrame.corr() raises on
# them in pandas 2.x rather than ignoring them.
sns.heatmap(df.select_dtypes(include="number").corr())

[22]: <AxesSubplot:>

[ ]:

You might also like