Professional Documents
Culture Documents
Uts Jesiska
Uts Jesiska
29 uts jesiska
NAMA:JESISKA
NIM:21010511
JURUSAN:SISTEM INFORMASI
SEMESTER:V
1. Data Wrangling
Out[2]:
customer_id customer_name gender age home_address zip_code city state country
4 5 fulan 5 Prefer not to say 30 48 Hyatt ManorSuite 375 4032 Griffithsshire Queensland Australia
Out[3]:
order_id customer_id payment order_date delivery_date
Out[4]:
product_id product_type product_name size colour price quantity description
0 0 Shirt Oxford Cloth XS red 114 66 A red coloured, XS sized, Oxford Cloth Shirt
1 1 Shirt Oxford Cloth S red 114 53 A red coloured, S sized, Oxford Cloth Shirt
2 2 Shirt Oxford Cloth M red 114 54 A red coloured, M sized, Oxford Cloth Shirt
3 3 Shirt Oxford Cloth L red 114 69 A red coloured, L sized, Oxford Cloth Shirt
4 4 Shirt Oxford Cloth XL red 114 47 A red coloured, XL sized, Oxford Cloth Shirt
Out[5]:
sales_id order_id product_id price_per_unit quantity total_price
2 2 1 2 96 3 288.0
In [6]: customers_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1007 entries, 0 to 1006
Data columns (total 9 columns):
customer_id 1007 non-null int64
customer_name 1007 non-null object
gender 989 non-null object
age 1007 non-null int64
home_address 1007 non-null object
zip_code 1007 non-null int64
city 1007 non-null object
state 1007 non-null object
country 1007 non-null object
dtypes: int64(3), object(6)
memory usage: 70.9+ KB
In [7]: customers_df.isna().sum()
Out[7]: customer_id 0
customer_name 0
gender 18
age 0
home_address 0
zip_code 0
city 0
state 0
country 0
dtype: int64
Jumlah duplikasi: 6
In [9]: customers_df.describe()
Out[9]:
customer_id age zip_code
In [10]: orders_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
order_id 1000 non-null int64
customer_id 1000 non-null int64
payment 1000 non-null int64
order_date 1000 non-null object
delivery_date 1000 non-null object
dtypes: int64(3), object(2)
memory usage: 39.1+ KB
Jumlah duplikasi: 0
Out[11]:
order_id customer_id payment
In [12]: product_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1266 entries, 0 to 1265
Data columns (total 8 columns):
product_id 1266 non-null int64
product_type 1266 non-null object
product_name 1266 non-null object
size 1266 non-null object
colour 1266 non-null object
price 1266 non-null int64
quantity 1266 non-null int64
description 1266 non-null object
dtypes: int64(3), object(5)
memory usage: 79.2+ KB
product_df.describe()
Jumlah duplikasi: 6
Out[13]:
product_id price quantity
In [14]: sales_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
sales_id 5000 non-null int64
order_id 5000 non-null int64
product_id 5000 non-null int64
price_per_unit 5000 non-null int64
quantity 5000 non-null int64
total_price 4981 non-null float64
dtypes: float64(1), int64(5)
memory usage: 234.5 KB
In [15]: sales_df.isna().sum()
Out[15]: sales_id 0
order_id 0
product_id 0
price_per_unit 0
quantity 0
total_price 19
dtype: int64
Jumlah duplikasi: 0
Out[16]:
sales_id order_id product_id price_per_unit quantity total_price
In [17]: customers_df.drop_duplicates(inplace=True)
Jumlah duplikasi: 0
In [19]: customers_df[customers_df.gender.isna()]
Out[19]:
customer_id customer_name gender age home_address zip_code city state country
167 168 fulan 168 NaN 27 2781 Berge MallSuite 452 1975 North Leoburgh Western Australia Australia
322 322 fulan 322 NaN 30 593 Becker CircleApt. 333 1640 Jacobiview Western Australia Australia
393 393 fulan 393 NaN 34 5158 Levi HillSuite 531 1474 Johnsburgh Queensland Australia
442 442 fulan 442 NaN 26 5157 Feil RoadApt. 633 7249 Port Chloe New South Wales Australia
722 720 fulan 720 NaN 40 31 Jordan ParadeApt. 400 1380 West Henry South Australia Australia
745 743 fulan 743 NaN 57 09 Christopher StreetSuite 967 6226 Lake Lukemouth Western Australia Australia
798 795 fulan 795 NaN 49 487 Summer MewsApt. 874 1712 East Hayden Australian Capital Territory Australia
801 798 fulan 798 NaN 56 27 Aiden KnollApt. 875 6531 Port Sam Australian Capital Territory Australia
825 822 fulan 822 NaN 59 41 Jenkins KnollSuite 438 2588 Lake Andrewport South Australia Australia
859 855 fulan 855 NaN 55 603 O'keefe KnollSuite 782 8822 Port Dylanmouth Tasmania Australia
863 859 fulan 859 NaN 38 32 Isla GroveApt. 078 7711 Rosechester New South Wales Australia
934 929 fulan 929 NaN 68 394 Lily HillSuite 153 2353 Beahanfurt Northern Territory Australia
948 943 fulan 943 NaN 64 3117 Heller PlaceSuite 149 822 North Elijah South Australia Australia
952 946 fulan 946 NaN 24 8227 Nicholas HillSuite 150 115 South Jasper Queensland Australia
In [20]: customers_df.gender.value_counts()
In [22]: customers_df.isna().sum()
Out[22]: customer_id 0
customer_name 0
gender 0
age 0
home_address 0
zip_code 0
city 0
state 0
country 0
dtype: int64
Out[23]:
customer_id customer_name gender age home_address zip_code city state country
967 961 fulan 961 Prefer not to say 700 29 Farrell ParadeSuite 818 6528 New Joseph South Australia Australia
Out[25]:
customer_id customer_name gender age home_address zip_code city state country
215 216 fulan 216 Prefer not to say 500 038 Haley MewsApt. 810 3991 Bayertown Northern Territory Australia
In [28]: orders_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
order_id 1000 non-null int64
customer_id 1000 non-null int64
payment 1000 non-null int64
order_date 1000 non-null datetime64[ns]
delivery_date 1000 non-null datetime64[ns]
dtypes: datetime64[ns](2), int64(3)
memory usage: 39.1 KB
In [29]: product_df.drop_duplicates(inplace=True)
Jumlah duplikasi: 0
In [31]: sales_df[sales_df.total_price.isna()]
Out[31]:
sales_id order_id product_id price_per_unit quantity total_price
In [33]: sales_df.isna().sum()
Out[33]: sales_id 0
order_id 0
product_id 0
price_per_unit 0
quantity 0
total_price 0
dtype: int64
In [34]: customers_df.describe(include="all")
Out[34]:
customer_id customer_name gender age home_address zip_code city state country
count 1001.000000 1001 1001 1001.000000 1001 1001.000000 1001 1001 1001
mean 500.942058 NaN NaN 49.874126 NaN 5000.693307 NaN NaN NaN
std 289.013599 NaN NaN 17.644663 NaN 2886.084454 NaN NaN NaN
min 1.000000 NaN NaN 20.000000 NaN 2.000000 NaN NaN NaN
25% 251.000000 NaN NaN 34.000000 NaN 2398.000000 NaN NaN NaN
50% 501.000000 NaN NaN 50.000000 NaN 5079.000000 NaN NaN NaN
75% 751.000000 NaN NaN 65.000000 NaN 7454.000000 NaN NaN NaN
max 1000.000000 NaN NaN 80.000000 NaN 9998.000000 NaN NaN NaN
In [35]: customers_df.groupby(by="gender").agg({
"customer_id": "nunique",
"age": ["max", "min", "mean", "std"]
})
Out[35]:
customer_id age
gender
In [36]: customers_df.groupby(by="city").customer_id.nunique().sort_values(ascending=False)
customers_df.groupby(by="state").customer_id.nunique().sort_values(ascending=False)
Out[36]: state
South Australia 139
Queensland 134
New South Wales 132
Northern Territory 125
Western Australia 124
Victoria 121
Australian Capital Territory 121
Tasmania 104
Name: customer_id, dtype: int64
In [39]: orders_df.head()
Out[39]:
order_id customer_id payment order_date delivery_date delivery_time
In [40]: orders_df.describe(include="all")
Out[40]:
order_id customer_id payment order_date delivery_date delivery_time
Out[41]:
customer_id customer_name gender age home_address zip_code city state country status
In [42]: customers_df.groupby(by="status").customer_id.count()
Out[42]: status
Active 617
Non Active 384
Name: customer_id, dtype: int64
Out[43]:
order_id customer_id payment order_date delivery_date delivery_time customer_name gender age home_address zip_code
531 Schmitt
1 2 473 50490 2021-02-03 2021-02-13 10.0 fulan 473 Male 61 BoulevardApt. 1744
010
Prefer
2096 Wilson
2 3 774 46763 2021-10-08 2021-11-03 26.0 fulan 774 not to 34 8590
MewsApt. 714 J
say
Prefer
5777 Mayer
3 4 433 39782 2021-05-06 2021-05-19 13.0 fulan 433 not to 46 9728
PassApt. 881
say
Prefer 33 Richards
4 5 441 14719 2021-03-23 2021-03-24 1.0 fulan 441 not to 53 JunctionApt. 7650 So
say 478
In [44]: orders_customers_df.groupby(by="city").order_id.nunique().sort_values(ascending=False).reset_index().head(10)
Out[44]:
city order_id
0 New Ava 6
1 Jordanside 6
2 Rubyfort 5
3 O'keefeton 5
4 West Kai 5
5 Port Hannahburgh 5
6 East Max 5
7 Lake Rose 5
8 Claudiaview 4
9 Andrewborough 4
In [45]: orders_customers_df.groupby(by="state").order_id.nunique().sort_values(ascending=False)
Out[45]: state
South Australia 148
Queensland 139
Western Australia 130
New South Wales 129
Victoria 118
Australian Capital Territory 118
Tasmania 112
Northern Territory 106
Name: order_id, dtype: int64
In [47]: orders_customers_df.groupby(by="gender").order_id.nunique().sort_values(ascending=False)
Out[47]: gender
Prefer not to say 725
Female 139
Male 136
Name: order_id, dtype: int64
Out[48]: age_group
Adults 681
Seniors 226
Youth 93
Name: order_id, dtype: int64
Out[49]:
sales_id order_id product_id price_per_unit quantity total_price
Out[50]:
product_id product_type product_name size colour price quantity description
704 698 Jacket Parka L violet 119 53 A violet coloured, L sized, Parka Jacket
671 665 Jacket Parka XS red 119 65 A red coloured, XS sized, Parka Jacket
698 692 Jacket Parka M indigo 119 66 A indigo coloured, M sized, Parka Jacket
699 693 Jacket Parka L indigo 119 44 A indigo coloured, L sized, Parka Jacket
700 694 Jacket Parka XL indigo 119 78 A indigo coloured, XL sized, Parka Jacket
701 695 Jacket Parka XS violet 119 59 A violet coloured, XS sized, Parka Jacket
702 696 Jacket Parka S violet 119 67 A violet coloured, S sized, Parka Jacket
703 697 Jacket Parka M violet 119 75 A violet coloured, M sized, Parka Jacket
951 945 Trousers Slim-Fit XS red 119 72 A red coloured, XS sized, Slim-Fit Trousers
952 946 Trousers Slim-Fit S red 119 71 A red coloured, S sized, Slim-Fit Trousers
697 691 Jacket Parka S indigo 119 53 A indigo coloured, S sized, Parka Jacket
696 690 Jacket Parka XS indigo 119 57 A indigo coloured, XS sized, Parka Jacket
695 689 Jacket Parka XL blue 119 47 A blue coloured, XL sized, Parka Jacket
683 677 Jacket Parka M yellow 119 46 A yellow coloured, M sized, Parka Jacket
673 667 Jacket Parka M red 119 56 A red coloured, M sized, Parka Jacket
1209 1203 Trousers Pleated L yellow 90 62 A yellow coloured, L sized, Pleated Trousers
1208 1202 Trousers Pleated M yellow 90 54 A yellow coloured, M sized, Pleated Trousers
1207 1201 Trousers Pleated S yellow 90 71 A yellow coloured, S sized, Pleated Trousers
1206 1200 Trousers Pleated XS yellow 90 75 A yellow coloured, XS sized, Pleated Trousers
1204 1198 Trousers Pleated L orange 90 53 A orange coloured, L sized, Pleated Trousers
1229 1223 Trousers Pleated L violet 90 48 A violet coloured, L sized, Pleated Trousers
1203 1197 Trousers Pleated M orange 90 57 A orange coloured, M sized, Pleated Trousers
1202 1196 Trousers Pleated S orange 90 72 A orange coloured, S sized, Pleated Trousers
1201 1195 Trousers Pleated XS orange 90 74 A orange coloured, XS sized, Pleated Trousers
1200 1194 Trousers Pleated XL red 90 58 A red coloured, XL sized, Pleated Trousers
1199 1193 Trousers Pleated L red 90 70 A red coloured, L sized, Pleated Trousers
1198 1192 Trousers Pleated M red 90 40 A red coloured, M sized, Pleated Trousers
1212 1206 Trousers Pleated S green 90 55 A green coloured, S sized, Pleated Trousers
1213 1207 Trousers Pleated M green 90 41 A green coloured, M sized, Pleated Trousers
1214 1208 Trousers Pleated L green 90 47 A green coloured, L sized, Pleated Trousers
1215 1209 Trousers Pleated XL green 90 55 A green coloured, XL sized, Pleated Trousers
1216 1210 Trousers Pleated XS blue 90 62 A blue coloured, XS sized, Pleated Trousers
1217 1211 Trousers Pleated S blue 90 60 A blue coloured, S sized, Pleated Trousers
1218 1212 Trousers Pleated M blue 90 66 A blue coloured, M sized, Pleated Trousers
1219 1213 Trousers Pleated L blue 90 79 A blue coloured, L sized, Pleated Trousers
1220 1214 Trousers Pleated XL blue 90 55 A blue coloured, XL sized, Pleated Trousers
1221 1215 Trousers Pleated XS indigo 90 52 A indigo coloured, XS sized, Pleated Trousers
1222 1216 Trousers Pleated S indigo 90 41 A indigo coloured, S sized, Pleated Trousers
1223 1217 Trousers Pleated M indigo 90 48 A indigo coloured, M sized, Pleated Trousers
1224 1218 Trousers Pleated L indigo 90 73 A indigo coloured, L sized, Pleated Trousers
1225 1219 Trousers Pleated XL indigo 90 45 A indigo coloured, XL sized, Pleated Trousers
1226 1220 Trousers Pleated XS violet 90 58 A violet coloured, XS sized, Pleated Trousers
1227 1221 Trousers Pleated S violet 90 50 A violet coloured, S sized, Pleated Trousers
1228 1222 Trousers Pleated M violet 90 45 A violet coloured, M sized, Pleated Trousers
599 593 Jacket Bomber L violet 90 68 A violet coloured, L sized, Bomber Jacket
In [51]: product_df.groupby(by="product_type").agg({
"product_id": "nunique",
"quantity": "sum",
"price": ["min", "max"]
})
product_df.groupby(by="product_name").agg({
"product_id": "nunique",
"quantity": "sum",
"price": ["min", "max"]
})
Out[51]:
product_id quantity price
product_name
Bomber 35 2083 90 90
Cropped 35 2085 99 99
Flannel 35 2135 96 96
Henley 35 2051 92 92
High-Waisted 35 2198 98 98
Joggers 35 2107 94 94
Pleated 35 2030 90 90
Shearling 35 2169 95 95
Out[52]:
sales_id order_id product_id price_per_unit quantity_x total_price product_type product_name size colour price quantity_y d
c
1 1 1 481 118 1 118 Jacket Puffer S indigo 110 62
c
2 2 1 2 96 3 288 Shirt Oxford Cloth M red 114 54
c
3 3 1 1002 106 2 212 Trousers Wool M blue 111 52
s
c
4 4 1 691 113 3 339 Jacket Parka S indigo 119 53
In [53]: sales_product_df.groupby(by="product_type").agg({
"sales_id": "nunique",
"quantity_x": "sum",
"total_price": "sum"
})
Out[53]:
sales_id quantity_x total_price
product_type
In [54]: sales_product_df.groupby(by="product_name").agg({
"sales_id": "nunique",
"quantity_x": "sum",
"total_price": "sum"
}).sort_values(by="total_price", ascending=False)
Out[54]:
sales_id quantity_x total_price
product_name
Out[55]:
sales_id order_id product_id price_per_unit quantity_x total_price product_type product_name size colour ... customer_name
5 rows × 28 columns
Out[56]:
quantity_x total_price
state product_type
# Sum quantity_x and total_price for every (age_group, product_type) pair.
# The aggregation spec is named so the grouping expression stays on one line.
age_product_metrics = {
    "quantity_x": "sum",
    "total_price": "sum",
}
all_df.groupby(by=["age_group", "product_type"]).agg(age_product_metrics)
Out[57]:
quantity_x total_price
age_group product_type
3. Data Visualization
Out[61]:
index order_count revenue
1 2021-02 93 95080
3 2021-04 99 97530
4 2021-05 84 85597
# Move the index into a regular column and apply all column labels in a
# single, non-mutating rename (same result as the two in-place renames).
column_labels = {
    "order_id": "order_count",
    "total_price": "revenue",
    "index": "order_date",
}
monthly_orders_df = monthly_orders_df.reset_index().rename(columns=column_labels)

# Line chart: order_count plotted against order_date.
order_dates = monthly_orders_df["order_date"]
order_counts = monthly_orders_df["order_count"]

plt.figure(figsize=(10, 5))
plt.plot(order_dates, order_counts, marker="o", linewidth=2, color="#72BCD4")
plt.title("Number of Orders per Month (2021)", loc="center", fontsize=20)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.show()
3.2. Produk Apa yang Paling Banyak dan Paling Sedikit Terjual?
In [64]: sum_order_items_df = all_df.groupby("product_name").quantity_x.sum().sort_values(ascending=False).reset_index()
sum_order_items_df.head(15)
Out[64]:
product_name quantity_x
0 Denim 527
1 Joggers 334
2 Pleated 308
4 Shearling 302
7 Puffer 298
8 Bomber 297
9 Chambray 290
11 Drawstring 288
12 Slim-Fit 288
13 Windbreaker 287
14 Cropped 284
# Left panel: the first five rows of sum_order_items_df.
best_sellers = sum_order_items_df.head(5)
best_ax = ax[0]
sns.barplot(x="quantity_x", y="product_name", data=best_sellers,
            palette=colors, ax=best_ax)
best_ax.set_ylabel(None)
best_ax.set_xlabel(None)
best_ax.set_title("Best Performing Product", loc="center", fontsize=15)
best_ax.tick_params(axis="y", labelsize=12)

# Right panel: the five rows with the smallest quantity_x. The x axis is
# inverted and the y ticks/label are moved to the right-hand side.
worst_sellers = sum_order_items_df.sort_values(by="quantity_x", ascending=True).head(5)
worst_ax = ax[1]
sns.barplot(x="quantity_x", y="product_name", data=worst_sellers,
            palette=colors, ax=worst_ax)
worst_ax.set_ylabel(None)
worst_ax.set_xlabel(None)
worst_ax.invert_xaxis()
worst_ax.yaxis.set_label_position("right")
worst_ax.yaxis.tick_right()
worst_ax.set_title("Worst Performing Product", loc="center", fontsize=15)
worst_ax.tick_params(axis="y", labelsize=12)
# Bar chart of customer_count per gender, tallest bar first.
gender_counts_desc = bygender_df.sort_values(by="customer_count", ascending=False)

plt.figure(figsize=(10, 5))
sns.barplot(x="gender", y="customer_count", data=gender_counts_desc, palette=colors)
plt.title("Number of Customer by Gender", loc="center", fontsize=15)
plt.ylabel(None)
plt.xlabel(None)
plt.tick_params(axis="x", labelsize=12)
plt.show()
# Bar chart of customer_count per age_group; rows are sorted by age_group
# in descending order before plotting.
age_counts_desc = byage_df.sort_values(by="age_group", ascending=False)

sns.barplot(x="age_group", y="customer_count", data=age_counts_desc, palette=colors_)
plt.title("Number of Customer by Age", loc="center", fontsize=15)
plt.ylabel(None)
plt.xlabel(None)
plt.tick_params(axis="x", labelsize=12)
plt.show()
Out[70]:
customer_id frequency monetary recency
0 1 3 1641 203
1 7 1 1017 156
2 10 1 270 229
3 11 1 382 149
4 12 1 1551 127
# One panel per RFM metric, left to right:
# (metric column, sort ascending?, panel title).
# Recency is sorted ascending (smallest values first); frequency and
# monetary are sorted descending (largest values first).
rfm_panels = [
    ("recency", True, "By Recency (days)"),
    ("frequency", False, "By Frequency"),
    ("monetary", False, "By Monetary"),
]

for position, (metric, smallest_first, panel_title) in enumerate(rfm_panels):
    # Five customers at the head of the frame once sorted by this metric.
    top_customers = rfm_df.sort_values(by=metric, ascending=smallest_first).head(5)
    sns.barplot(y=metric, x="customer_id", data=top_customers,
                palette=colors, ax=ax[position])
    ax[position].set_ylabel(None)
    ax[position].set_xlabel(None)
    ax[position].set_title(panel_title, loc="center", fontsize=18)
    ax[position].tick_params(axis="x", labelsize=15)
In [ ]: