Professional Documents
Culture Documents
EDA of Hotel Booking Dataset - Kaggle
EDA of Hotel Booking Dataset - Kaggle
EDA of Hotel Booking Dataset - Kaggle
Out[2]:
hotel is_canceled lead_time arrival_date_year arrival_date_month arrival_date_week_num
Resort
0 0 342 2015 July
Hotel
Resort
1 0 737 2015 July
Hotel
Resort
2 0 7 2015 July
Hotel
Resort
3 0 13 2015 July
Hotel
Resort
4 0 14 2015 July
Hotel
City
119385 0 23 2017 August
Hotel
City
119386 0 102 2017 August
Hotel
City
119387 0 34 2017 August
Hotel
City
119388 0 109 2017 August
Hotel
City
119389 0 205 2017 August
Hotel
Out[3]:
hotel is_canceled lead_time arrival_date_year arrival_date_month arrival_date_week_number
Resort
0 0 342 2015 July 27
Hotel
Resort
1 0 737 2015 July 27
Hotel
Resort
2 0 7 2015 July 27
Hotel
Resort
3 0 13 2015 July 27
Hotel
Resort
4 0 14 2015 July 27
Hotel
5 rows × 32 columns
Out[4]:
hotel is_canceled lead_time arrival_date_year arrival_date_month arrival_date_week_numb
City
119385 0 23 2017 August
Hotel
City
119386 0 102 2017 August
Hotel
City
119387 0 34 2017 August
Hotel
City
119388 0 109 2017 August
Hotel
City
119389 0 205 2017 August
Hotel
5 rows × 32 columns
'arrival_date_month', 'arrival_date_week_number',
'arrival_date_day_of_month', 'stays_in_weekend_nights',
'is_repeated_guest', 'previous_cancellations',
'previous_bookings_not_canceled', 'reserved_room_type',
'required_car_parking_spaces', 'total_of_special_requests',
'reservation_status', 'reservation_status_date'],
dtype='object')
Out[6]:
is_canceled lead_time arrival_date_year arrival_date_week_number arrival_date_day_
<class 'pandas.core.frame.DataFrame'>
is_canceled int64
lead_time int64
arrival_date_year int64
arrival_date_month object
arrival_date_week_number int64
arrival_date_day_of_month int64
stays_in_weekend_nights int64
stays_in_week_nights int64
adults int64
children float64
babies int64
meal object
country object
market_segment object
distribution_channel object
is_repeated_guest int64
previous_cancellations int64
previous_bookings_not_canceled int64
reserved_room_type object
assigned_room_type object
booking_changes int64
deposit_type object
agent float64
company float64
days_in_waiting_list int64
customer_type object
adr float64
required_car_parking_spaces int64
total_of_special_requests int64
reservation_status object
reservation_status_date object
dtype: object
Out[10]: hotel 0
is_canceled 0
lead_time 0
arrival_date_year 0
arrival_date_month 0
arrival_date_week_number 0
arrival_date_day_of_month 0
stays_in_weekend_nights 0
stays_in_week_nights 0
adults 0
children 4
babies 0
meal 0
country 488
market_segment 0
distribution_channel 0
is_repeated_guest 0
previous_cancellations 0
previous_bookings_not_canceled 0
reserved_room_type 0
assigned_room_type 0
booking_changes 0
deposit_type 0
agent 16340
company 112593
days_in_waiting_list 0
customer_type 0
adr 0
required_car_parking_spaces 0
total_of_special_requests 0
reservation_status 0
reservation_status_date 0
dtype: int64
In [11]: df.head()
Out[11]:
hotel is_canceled lead_time arrival_date_year arrival_date_month arrival_date_week_number
Resort
0 0 342 2015 July 27
Hotel
Resort
1 0 737 2015 July 27
Hotel
Resort
2 0 7 2015 July 27
Hotel
Resort
3 0 13 2015 July 27
Hotel
Resort
4 0 14 2015 July 27
Hotel
5 rows × 32 columns
Dropping the 'company' column because it contains a large
number of null values (112,593 nulls)
In [12]: df = df.drop('company', axis=1)
In [13]: df.shape
In [14]: df.columns
'arrival_date_month', 'arrival_date_week_number',
'arrival_date_day_of_month', 'stays_in_weekend_nights',
'is_repeated_guest', 'previous_cancellations',
'previous_bookings_not_canceled', 'reserved_room_type',
'required_car_parking_spaces', 'total_of_special_requests',
'reservation_status', 'reservation_status_date'],
dtype='object')
In [16]: df['agent']
Out[16]: 0 0.0
1 0.0
2 0.0
3 304.0
4 240.0
...
119385 394.0
119386 9.0
119387 9.0
119388 89.0
119389 9.0
GBR 12129
FRA 10415
ESP 8568
DEU 7287
...
ASM 1
BWA 1
HND 1
MLI 1
BHS 1
In [19]: df['country'].value_counts()
GBR 12129
FRA 10415
ESP 8568
DEU 7287
...
ASM 1
BWA 1
HND 1
MLI 1
BHS 1
Out[20]: hotel 0
is_canceled 0
lead_time 0
arrival_date_year 0
arrival_date_month 0
arrival_date_week_number 0
arrival_date_day_of_month 0
stays_in_weekend_nights 0
stays_in_week_nights 0
adults 0
children 4
babies 0
meal 0
country 0
market_segment 0
distribution_channel 0
is_repeated_guest 0
previous_cancellations 0
previous_bookings_not_canceled 0
reserved_room_type 0
assigned_room_type 0
booking_changes 0
deposit_type 0
agent 0
days_in_waiting_list 0
customer_type 0
adr 0
required_car_parking_spaces 0
total_of_special_requests 0
reservation_status 0
reservation_status_date 0
dtype: int64
Out[21]:
is_canceled lead_time arrival_date_year arrival_date_week_numbe
Out[22]: 32020
'arrival_date_month', 'arrival_date_week_number',
'arrival_date_day_of_month', 'stays_in_weekend_nights',
'is_repeated_guest', 'previous_cancellations',
'previous_bookings_not_canceled', 'reserved_room_type',
'required_car_parking_spaces', 'total_of_special_requests',
'reservation_status', 'reservation_status_date'],
dtype='object')
Out[29]: 87370
Out[31]: arrival_date_month
August 11257
July 10055
May 8354
April 7905
June 7765
March 7510
October 6932
September 6689
February 6091
December 5128
November 4993
January 4691
Out[33]:
hotel is_canceled lead_time arrival_date_year arrival_date_month arrival_date_week_number
Resort
0 0 342 2015 July 27
Hotel
Resort
1 0 737 2015 July 27
Hotel
Resort
2 0 7 2015 July 27
Hotel
Resort
3 0 13 2015 July 27
Hotel
Resort
4 0 14 2015 July 27
Hotel
5 rows × 31 columns
Out[34]: arrival_date_week_number
1 862
2 945
3 1049
4 1124
5 1101
6 1295
7 1629
8 1523
9 1579
10 1630
11 1657
12 1572
13 1817
14 1693
15 1989
16 1736
17 1878
18 2087
19 1813
20 1843
21 2043
22 1753
23 1872
24 1746
25 1786
26 1739
27 2166
28 2343
29 2197
30 2335
31 2286
32 2449
33 2793
34 2491
35 2105
36 1626
37 1474
38 1634
39 1590
40 1427
41 1663
42 1445
43 1605
44 1548
45 1314
46 1141
47 1288
48 1199
49 1169
50 1053
51 785
52 1061
53 1422
In [35]: df.groupby(['hotel'])['lead_time'].count().sort_values(ascending=False)
Out[35]: hotel
In [36]: df.groupby(['arrival_date_day_of_month'])['arrival_date_month'].count().sort_valu
Out[36]: arrival_date_day_of_month
17 3018
2 3015
26 3000
5 2980
16 2957
19 2949
12 2928
28 2924
18 2923
11 2915
20 2915
27 2902
29 2880
9 2878
15 2868
25 2837
3 2833
21 2822
13 2812
8 2808
6 2804
4 2798
10 2785
23 2776
24 2774
30 2770
1 2769
7 2704
14 2692
22 2601
31 1733
Out[38]: arrival_date_day_of_month
17 3018
2 3015
26 3000
5 2980
16 2957
19 2949
12 2928
28 2924
18 2923
11 2915
20 2915
27 2902
29 2880
9 2878
15 2868
25 2837
3 2833
21 2822
13 2812
8 2808
6 2804
4 2798
10 2785
23 2776
24 2774
30 2770
1 2769
7 2704
14 2692
22 2601
31 1733
In [40]: df.columns
'arrival_date_month', 'arrival_date_week_number',
'arrival_date_day_of_month', 'stays_in_weekend_nights',
'is_repeated_guest', 'previous_cancellations',
'previous_bookings_not_canceled', 'reserved_room_type',
'required_car_parking_spaces', 'total_of_special_requests',
'reservation_status', 'reservation_status_date'],
dtype='object')
Out[41]: 87370
BB 67955
SC 9481
HB 9084
Undefined 490
FB 360
'arrival_date_month', 'arrival_date_week_number',
'arrival_date_day_of_month', 'stays_in_weekend_nights',
'is_repeated_guest', 'previous_cancellations',
'previous_bookings_not_canceled', 'reserved_room_type',
'required_car_parking_spaces', 'total_of_special_requests',
'reservation_status', 'reservation_status_date'],
dtype='object')
Out[46]: hotel
In [47]:
# Scatter of `adults` against `hotel`, with points coloured by `meal`.
# seaborn returns the Axes it drew on, so the grid call is chained onto it.
sns.scatterplot(x="hotel", y="adults", hue='meal', data=df).grid()
plt.show()
In [48]:
# Scatter of `children` against `hotel`, with points coloured by `meal`.
sns.scatterplot(
    x='hotel',
    y='children',
    hue='meal',
    data=df,
)
# Apply the grid call to the axes seaborn just drew on (the current axes).
plt.gca().grid()
plt.show()
Insights:
1. Adults prefer the Resort Hotel and mostly order the BB meal.
2. Most children also prefer the Resort Hotel and mostly order the BB meal.
3. Families with babies prefer the City Hotel, and their meal is mostly BB.
In [50]: df.columns
'arrival_date_month', 'arrival_date_week_number',
'arrival_date_day_of_month', 'stays_in_weekend_nights',
'is_repeated_guest', 'previous_cancellations',
'previous_bookings_not_canceled', 'reserved_room_type',
'required_car_parking_spaces', 'total_of_special_requests',
'reservation_status', 'reservation_status_date'],
dtype='object')
In [51]: df['market_segment'].value_counts()
Direct 11798
Groups 4940
Corporate 4202
Complementary 702
Aviation 227
Undefined 2
Out[54]: hotel
Insight
Most people prefer to stay weekend nights in the City Hotel.
Out[56]: 87370
In [57]: df.groupby(['hotel'])['stays_in_week_nights'].count().sort_values(ascending=False
Out[57]: hotel
Insight
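Most people prefer to stay week nights in the City Hotel.
(Note: this is based on `.count()`, which counts bookings per hotel rather than the number of nights stayed.)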
Out[60]: ['hotel',
'is_canceled',
'lead_time',
'arrival_date_year',
'arrival_date_month',
'arrival_date_week_number',
'arrival_date_day_of_month',
'stays_in_weekend_nights',
'stays_in_week_nights',
'adults',
'children',
'babies',
'meal',
'country',
'market_segment',
'distribution_channel',
'is_repeated_guest',
'previous_cancellations',
'previous_bookings_not_canceled',
'reserved_room_type',
'assigned_room_type',
'booking_changes',
'deposit_type',
'agent',
'days_in_waiting_list',
'customer_type',
'adr',
'required_car_parking_spaces',
'total_of_special_requests',
'reservation_status',
'reservation_status_date']
In [61]: df.groupby(['market_segment','hotel']).count()
Out[61]:
is_canceled lead_time arrival_date_year arrival_date_month arrival_date_
market_segment hotel
Aviation City
227 227 227 227
Hotel
Complementary City
513 513 513 513
Hotel
Resort
189 189 189 189
Hotel
Corporate City
2227 2227 2227 2227
Hotel
Resort
1975 1975 1975 1975
Hotel
Direct City
5559 5559 5559 5559
Hotel
Resort
6239 6239 6239 6239
Hotel
Groups City
2635 2635 2635 2635
Hotel
Resort
2305 2305 2305 2305
Hotel
Resort
6615 6615 6615 6615
Hotel
Online TA City
34992 34992 34992 34992
Hotel
Resort
16621 16621 16621 16621
Hotel
Undefined City
2 2 2 2
Hotel
14 rows × 29 columns
In [62]: df.groupby(['market_segment'])['hotel'].count()
Out[62]: market_segment
Aviation 227
Complementary 702
Corporate 4202
Direct 11798
Groups 4940
Online TA 51613
Undefined 2
Out[64]: 0 83955
1 3415
In [66]: df.groupby(['hotel','is_repeated_guest']).count()
Out[66]:
is_canceled lead_time arrival_date_year arrival_date_month arrival_dat
hotel is_repeated_guest
4 rows × 29 columns
In [67]: df.groupby(['is_repeated_guest','hotel']).count()
Out[67]:
is_canceled lead_time arrival_date_year arrival_date_month arrival_dat
is_repeated_guest hotel
0 City
51718 51718 51718 51718
Hotel
Resort
32237 32237 32237 32237
Hotel
1 City
1708 1708 1708 1708
Hotel
Resort
1707 1707 1707 1707
Hotel
4 rows × 29 columns
In [68]: list(df.columns)
Out[68]: ['hotel',
'is_canceled',
'lead_time',
'arrival_date_year',
'arrival_date_month',
'arrival_date_week_number',
'arrival_date_day_of_month',
'stays_in_weekend_nights',
'stays_in_week_nights',
'adults',
'children',
'babies',
'meal',
'country',
'market_segment',
'distribution_channel',
'is_repeated_guest',
'previous_cancellations',
'previous_bookings_not_canceled',
'reserved_room_type',
'assigned_room_type',
'booking_changes',
'deposit_type',
'agent',
'days_in_waiting_list',
'customer_type',
'adr',
'required_car_parking_spaces',
'total_of_special_requests',
'reservation_status',
'reservation_status_date']
In [69]: df['distribution_channel'].value_counts()
Direct 12980
Corporate 5071
GDS 181
Undefined 5
In [72]: df['reservation_status'].value_counts()
Canceled 23010
No-Show 1014
Out[73]:
is_canceled lead_time arrival_date_year arrival_date_month arrival_dat
hotel reservation_status
6 rows × 29 columns
In [74]: sns.countplot(data=df, hue='hotel', x='reservation_status')
# The line above draws one bar per reservation_status value, split by hotel (hue).
plt.show()
Out[76]: arrival_date_month
April 103.631776
August 150.876120
December 81.425918
February 74.731739
January 70.061422
July 135.521525
June 119.750120
March 81.624414
May 111.191058
November 72.768983
October 90.167276
September 112.081873
In [77]: list(df.columns)
Out[77]: ['hotel',
'is_canceled',
'lead_time',
'arrival_date_year',
'arrival_date_month',
'arrival_date_week_number',
'arrival_date_day_of_month',
'stays_in_weekend_nights',
'stays_in_week_nights',
'adults',
'children',
'babies',
'meal',
'country',
'market_segment',
'distribution_channel',
'is_repeated_guest',
'previous_cancellations',
'previous_bookings_not_canceled',
'reserved_room_type',
'assigned_room_type',
'booking_changes',
'deposit_type',
'agent',
'days_in_waiting_list',
'customer_type',
'adr',
'required_car_parking_spaces',
'total_of_special_requests',
'reservation_status',
'reservation_status_date']
In [78]: #To show the adr in each month
sns.scatterplot(x=df['arrival_date_month'], y=df['adr'])
plt.xticks(rotation=90)
plt.show()
In [79]: df['reserved_room_type'].value_counts()
Out[79]: A 56530
D 17397
E 6047
F 2822
G 2052
B 999
C 915
H 596
P 6
L 6
In [81]: df['assigned_room_type'].value_counts()
Out[81]: A 46301
D 22422
E 7193
F 3625
G 2498
C 2165
B 1820
H 706
I 357
K 276
P 6
L 1
In [83]: df.groupby(['hotel'])['assigned_room_type'].value_counts()
D 13209
E 2051
F 1970
B 1661
G 691
K 276
C 161
P 4
D 9213
E 5142
C 2004
G 1807
F 1655
H 706
I 357
B 159
P 2
L 1
Out[84]: ['hotel',
'is_canceled',
'lead_time',
'arrival_date_year',
'arrival_date_month',
'arrival_date_week_number',
'arrival_date_day_of_month',
'stays_in_weekend_nights',
'stays_in_week_nights',
'adults',
'children',
'babies',
'meal',
'country',
'market_segment',
'distribution_channel',
'is_repeated_guest',
'previous_cancellations',
'previous_bookings_not_canceled',
'reserved_room_type',
'assigned_room_type',
'booking_changes',
'deposit_type',
'agent',
'days_in_waiting_list',
'customer_type',
'adr',
'required_car_parking_spaces',
'total_of_special_requests',
'reservation_status',
'reservation_status_date']
In [85]: df.groupby(['previous_cancellations'])['hotel'].value_counts()
2 City Hotel 72
Resort Hotel 40
3 City Hotel 51
Resort Hotel 10
4 City Hotel 24
Resort Hotel 6
5 City Hotel 16
Resort Hotel 3
6 City Hotel 17
11 City Hotel 27
13 City Hotel 4
14 Resort Hotel 1
19 Resort Hotel 1
21 City Hotel 1
24 Resort Hotel 2
25 Resort Hotel 2
26 Resort Hotel 1
In [86]: df.groupby(['hotel'])['previous_cancellations'].value_counts()
1 966
2 72
3 51
11 27
4 24
6 17
5 16
13 4
21 1
1 441
2 40
3 10
4 6
5 3
24 2
25 2
14 1
19 1
26 1
In [88]: df['customer_type'].value_counts()
Transient-Party 11719
Contract 3139
Group 544
Transient 71968
Transient-Party 11719
Contract 3139
Group 544
In [90]: df['deposit_type'].value_counts()
Refundable 107
Out[93]: ['hotel',
'is_canceled',
'lead_time',
'arrival_date_year',
'arrival_date_month',
'arrival_date_week_number',
'arrival_date_day_of_month',
'stays_in_weekend_nights',
'stays_in_week_nights',
'adults',
'children',
'babies',
'meal',
'country',
'market_segment',
'distribution_channel',
'is_repeated_guest',
'previous_cancellations',
'previous_bookings_not_canceled',
'reserved_room_type',
'assigned_room_type',
'booking_changes',
'deposit_type',
'agent',
'days_in_waiting_list',
'customer_type',
'adr',
'required_car_parking_spaces',
'total_of_special_requests',
'reservation_status',
'reservation_status_date']
Out[94]:
month Number of guest
0 August 7634
1 July 6857
2 May 5912
3 March 5680
4 April 5497
5 June 5411
6 October 5290
7 September 5047
8 February 4676
9 November 3939
10 December 3750
11 January 3653
August 7634
July 6857
May 5912
March 5680
April 5497
June 5411
October 5290
September 5047
February 4676
November 3939
December 3750
January 3653
2017 31683
2015 13310
Out[98]:
index arrival_date_year
0 2016 31169
1 2017 21571
2 2015 10606
Pie chart of the number of guests who visited each year
and did not cancel their booking
In [99]: data = df[df['is_canceled'] == 0]['arrival_date_year'].value_counts()
# `data` holds the number of non-cancelled bookings (is_canceled == 0) per
# arrival year, in the order produced by value_counts().
print(data)
# Take the labels from the counted Series itself so each wedge is paired with
# its own year. The previous df['arrival_date_year'].unique() returns years in
# order of appearance in the frame, which does not match the value_counts()
# ordering of `data` and therefore attached the wrong year to each wedge.
labels = data.index
# Keep the first wedge in place and pull every other wedge out by 0.1.
# Built from len(data) so the cell keeps working if the set of years changes.
explode = tuple(0.0 if position == 0 else 0.1 for position in range(len(data)))
plt.figure(figsize=(12, 10))
plt.pie(data, labels=labels, explode=explode, autopct='%0.2f')
plt.show()
2016 31169
2017 21571
2015 10606
0 2016 31169
1 2017 21571
2 2015 10606
In [101]: list(df.columns)
Out[101]: ['hotel',
'is_canceled',
'lead_time',
'arrival_date_year',
'arrival_date_month',
'arrival_date_week_number',
'arrival_date_day_of_month',
'stays_in_weekend_nights',
'stays_in_week_nights',
'adults',
'children',
'babies',
'meal',
'country',
'market_segment',
'distribution_channel',
'is_repeated_guest',
'previous_cancellations',
'previous_bookings_not_canceled',
'reserved_room_type',
'assigned_room_type',
'booking_changes',
'deposit_type',
'agent',
'days_in_waiting_list',
'customer_type',
'adr',
'required_car_parking_spaces',
'total_of_special_requests',
'reservation_status',
'reservation_status_date']
Out[102]:
hotel no of guests
Out[104]:
hotel no of guests
Out[106]:
hotel no of guests
Out[109]:
country no of guest
0 PRT 9824
1 GBR 1985
2 ESP 1862
3 FRA 1733
4 ITA 1075
122 KHM 1
123 FJI 1
124 MCO 1
125 UMI 1
126 TJK 1
Out[110]:
country no of guest
0 PRT 18058
1 GBR 8447
2 FRA 7104
3 ESP 5390
4 DEU 4334
160 MAC 1
161 CYM 1
162 TJK 1
163 ZMB 1
164 PLW 1
Out[112]: ['hotel',
'is_canceled',
'lead_time',
'arrival_date_year',
'arrival_date_month',
'arrival_date_week_number',
'arrival_date_day_of_month',
'stays_in_weekend_nights',
'stays_in_week_nights',
'adults',
'children',
'babies',
'meal',
'country',
'market_segment',
'distribution_channel',
'is_repeated_guest',
'previous_cancellations',
'previous_bookings_not_canceled',
'reserved_room_type',
'assigned_room_type',
'booking_changes',
'deposit_type',
'agent',
'days_in_waiting_list',
'customer_type',
'adr',
'required_car_parking_spaces',
'total_of_special_requests',
'reservation_status',
'reservation_status_date']
In [113]: count = (
    df['assigned_room_type']
    .value_counts()                      # bookings per assigned room type
    .rename_axis('assignedroom')         # name the index so it becomes the first column
    .reset_index(name='no of guest')     # counts become the second column
)
# Display the two-column table (assignedroom, no of guest).
count
Out[113]:
assignedroom no of guest
0 A 46301
1 D 22422
2 E 7193
3 F 3625
4 G 2498
5 C 2165
6 B 1820
7 H 706
8 I 357
9 K 276
10 P 6
11 L 1
Out[114]:
assignedroom no of guest
0 A 32120
1 D 16991
2 E 5495
3 F 2731
4 C 1770
5 G 1745
6 B 1421
7 H 457
8 I 352
9 K 264
Out[115]:
assignedroom no of guest
0 A 14181
1 D 5431
2 E 1698
3 F 894
4 G 753
5 B 399
6 C 395
7 H 249
8 K 12
9 P 6
10 I 5
11 L 1
Out[118]: ['hotel',
'is_canceled',
'lead_time',
'arrival_date_year',
'arrival_date_month',
'arrival_date_week_number',
'arrival_date_day_of_month',
'stays_in_weekend_nights',
'stays_in_week_nights',
'adults',
'children',
'babies',
'meal',
'country',
'market_segment',
'distribution_channel',
'is_repeated_guest',
'previous_cancellations',
'previous_bookings_not_canceled',
'reserved_room_type',
'assigned_room_type',
'booking_changes',
'deposit_type',
'agent',
'days_in_waiting_list',
'customer_type',
'adr',
'required_car_parking_spaces',
'total_of_special_requests',
'reservation_status',
'reservation_status_date']
Total Nights
In [119]: df['stays_in_weekend_nights']+df['stays_in_week_nights']
Out[119]: 0 0
1 0
2 1
3 1
4 2
..
119385 7
119386 7
119387 7
119388 7
119389 9
In [121]: df.head()
Out[121]:
hotel is_canceled lead_time arrival_date_year arrival_date_month arrival_date_week_number
Resort
0 0 342 2015 July 27
Hotel
Resort
1 0 737 2015 July 27
Hotel
Resort
2 0 7 2015 July 27
Hotel
Resort
3 0 13 2015 July 27
Hotel
Resort
4 0 14 2015 July 27
Hotel
5 rows × 32 columns
In [122]: df['total_nights']
Out[122]: 0 0
1 0
2 1
3 1
4 2
..
119385 7
119386 7
119387 7
119388 7
119389 9
In [123]: df.groupby(['hotel'])['total_nights'].count()
Out[123]: hotel
2 10824
1 10282
4 9620
5 4180
...
Resort Hotel 38 1
45 1
46 1
60 1
69 1