Download as pdf or txt
Download as pdf or txt
You are on page 1of 25

01/11/23, 20.29 uts jesiska

NAMA:JESISKA

NIM:21010511

JURUSAN:SISTEM INFORMASI

SEMESTER:V

UTS SISTEM INFORMASI AKUNTANSI

Impor pustaka untuk analisis data


In [72]: import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

1. Data Wrangling

1.1 Gathering Data

In [2]: customers_df = pd.read_csv("customers.csv")


customers_df.head()

Out[2]:
customer_id customer_name gender age home_address zip_code city state country

0 1 fulan 1 Female 30 8606 Victoria TerraceSuite 560 5464 Johnstonhaven Northern Territory Australia

1 2 fulan 2 Prefer not to say 69 8327 Kirlin SummitApt. 461 8223 New Zacharyfort South Australia Australia

2 3 fulan 3 Prefer not to say 59 269 Gemma SummitSuite 109 5661 Aliburgh Australian Capital Territory Australia

3 4 fulan 4 Prefer not to say 67 743 Bailey GroveSuite 141 1729 South Justinhaven Queensland Australia

4 5 fulan 5 Prefer not to say 30 48 Hyatt ManorSuite 375 4032 Griffithsshire Queensland Australia

In [3]: orders_df = pd.read_csv("orders.csv")


orders_df.head()

Out[3]:
order_id customer_id payment order_date delivery_date

0 1 64 30811 2021-8-30 2021-09-24

1 2 473 50490 2021-2-3 2021-02-13

2 3 774 46763 2021-10-8 2021-11-03

3 4 433 39782 2021-5-6 2021-05-19

4 5 441 14719 2021-3-23 2021-03-24

In [4]: product_df = pd.read_csv("products.csv")


product_df.head()

Out[4]:
product_id product_type product_name size colour price quantity description

0 0 Shirt Oxford Cloth XS red 114 66 A red coloured, XS sized, Oxford Cloth Shirt

1 1 Shirt Oxford Cloth S red 114 53 A red coloured, S sized, Oxford Cloth Shirt

2 2 Shirt Oxford Cloth M red 114 54 A red coloured, M sized, Oxford Cloth Shirt

3 3 Shirt Oxford Cloth L red 114 69 A red coloured, L sized, Oxford Cloth Shirt

4 4 Shirt Oxford Cloth XL red 114 47 A red coloured, XL sized, Oxford Cloth Shirt

localhost:8888/notebooks/data_analyst/uts jesiska.ipynb#SEMESTER:V 1/25


01/11/23, 20.29 uts jesiska

In [5]: sales_df = pd.read_csv("sales.csv")


sales_df.head()

Out[5]:
sales_id order_id product_id price_per_unit quantity total_price

0 0 1 218 106 2 212.0

1 1 1 481 118 1 118.0

2 2 1 2 96 3 288.0

3 3 1 1002 106 2 212.0

4 4 1 691 113 3 339.0

1.2. Assessing Data

1.2.1 Menilai data customers_df

In [6]: customers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1007 entries, 0 to 1006
Data columns (total 9 columns):
customer_id 1007 non-null int64
customer_name 1007 non-null object
gender 989 non-null object
age 1007 non-null int64
home_address 1007 non-null object
zip_code 1007 non-null int64
city 1007 non-null object
state 1007 non-null object
country 1007 non-null object
dtypes: int64(3), object(6)
memory usage: 70.9+ KB

In [7]: customers_df.isna().sum()

Out[7]: customer_id 0
customer_name 0
gender 18
age 0
home_address 0
zip_code 0
city 0
state 0
country 0
dtype: int64

In [8]: print("Jumlah duplikasi: ", customers_df.duplicated().sum())

Jumlah duplikasi: 6

In [9]: customers_df.describe()

Out[9]:
customer_id age zip_code

count 1007.000000 1007.000000 1007.000000

mean 501.726912 50.929494 5012.538232

std 288.673238 30.516299 2885.836112

min 1.000000 20.000000 2.000000

25% 252.500000 34.000000 2403.500000

50% 502.000000 50.000000 5087.000000

75% 751.500000 65.000000 7493.500000

max 1000.000000 700.000000 9998.000000

localhost:8888/notebooks/data_analyst/uts jesiska.ipynb#SEMESTER:V 2/25


01/11/23, 20.29 uts jesiska

1.2.2 Menilai data orders_df

In [10]: orders_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
order_id 1000 non-null int64
customer_id 1000 non-null int64
payment 1000 non-null int64
order_date 1000 non-null object
delivery_date 1000 non-null object
dtypes: int64(3), object(2)
memory usage: 39.1+ KB

In [11]: print("Jumlah duplikasi: ",orders_df.duplicated().sum())



orders_df.describe()

Jumlah duplikasi: 0

Out[11]:
order_id customer_id payment

count 1000.000000 1000.000000 1000.000000

mean 500.500000 506.640000 33972.936000

std 288.819436 277.115502 14451.609047

min 1.000000 1.000000 10043.000000

25% 250.750000 275.250000 21329.250000

50% 500.500000 515.000000 33697.500000

75% 750.250000 737.250000 46249.000000

max 1000.000000 1000.000000 59910.000000

1.2.3. Menilai data product_df

In [12]: product_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1266 entries, 0 to 1265
Data columns (total 8 columns):
product_id 1266 non-null int64
product_type 1266 non-null object
product_name 1266 non-null object
size 1266 non-null object
colour 1266 non-null object
price 1266 non-null int64
quantity 1266 non-null int64
description 1266 non-null object
dtypes: int64(3), object(5)
memory usage: 79.2+ KB

In [13]: print("Jumlah duplikasi: ", product_df.duplicated().sum())

product_df.describe()

Jumlah duplikasi: 6

Out[13]:
product_id price quantity

count 1266.000000 1266.000000 1266.000000

mean 627.926540 105.812006 60.138231

std 363.971586 9.715611 11.682791

min 0.000000 90.000000 40.000000

25% 313.250000 95.250000 50.000000

50% 626.500000 109.000000 60.000000

75% 942.750000 114.000000 70.000000

max 1259.000000 119.000000 80.000000

localhost:8888/notebooks/data_analyst/uts jesiska.ipynb#SEMESTER:V 3/25


01/11/23, 20.29 uts jesiska

1.2.4 Menilai data sales_df

In [14]: sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
sales_id 5000 non-null int64
order_id 5000 non-null int64
product_id 5000 non-null int64
price_per_unit 5000 non-null int64
quantity 5000 non-null int64
total_price 4981 non-null float64
dtypes: float64(1), int64(5)
memory usage: 234.5 KB

In [15]: sales_df.isna().sum()

Out[15]: sales_id 0
order_id 0
product_id 0
price_per_unit 0
quantity 0
total_price 19
dtype: int64

In [16]: print("Jumlah duplikasi: ", sales_df.duplicated().sum())



sales_df.describe()

Jumlah duplikasi: 0

Out[16]:
sales_id order_id product_id price_per_unit quantity total_price

count 5000.000000 5000.000000 5000.000000 5000.000000 5000.00000 4981.000000

mean 2499.500000 503.038200 634.053200 103.501600 1.99240 206.307368

std 1443.520003 285.964418 363.255794 9.195004 0.80751 86.352449

min 0.000000 1.000000 1.000000 90.000000 1.00000 90.000000

25% 1249.750000 258.000000 323.000000 95.000000 1.00000 112.000000

50% 2499.500000 504.500000 635.000000 102.000000 2.00000 204.000000

75% 3749.250000 749.000000 951.000000 112.000000 3.00000 285.000000

max 4999.000000 999.000000 1259.000000 119.000000 3.00000 357.000000

1.3. Cleaning Data

1.3.1 Membersihkan data customers_df

In [17]: customers_df.drop_duplicates(inplace=True)

In [18]: print("Jumlah duplikasi: ", customers_df.duplicated().sum())

Jumlah duplikasi: 0

localhost:8888/notebooks/data_analyst/uts jesiska.ipynb#SEMESTER:V 4/25


01/11/23, 20.29 uts jesiska

In [19]: customers_df[customers_df.gender.isna()]

Out[19]:
customer_id customer_name gender age home_address zip_code city state country

7440 Cameron Estate North


38 39 fulan 39 NaN 80 4622 Northern Territory Australia
DrSuite 628 Victoriachester

167 168 fulan 168 NaN 27 2781 Berge MallSuite 452 1975 North Leoburgh Western Australia Australia

322 322 fulan 322 NaN 30 593 Becker CircleApt. 333 1640 Jacobiview Western Australia Australia

393 393 fulan 393 NaN 34 5158 Levi HillSuite 531 1474 Johnsburgh Queensland Australia

442 442 fulan 442 NaN 26 5157 Feil RoadApt. 633 7249 Port Chloe New South Wales Australia

722 720 fulan 720 NaN 40 31 Jordan ParadeApt. 400 1380 West Henry South Australia Australia

09 Christopher StreetSuite
745 743 fulan 743 NaN 57 6226 Lake Lukemouth Western Australia Australia
967

7367 Wright JunctionApt.


773 771 fulan 771 NaN 74 8882 Kuhntown Victoria Australia
773

Australian Capital
798 795 fulan 795 NaN 49 487 Summer MewsApt. 874 1712 East Hayden Australia
Territory

Australian Capital
801 798 fulan 798 NaN 56 27 Aiden KnollApt. 875 6531 Port Sam Australia
Territory

825 822 fulan 822 NaN 59 41 Jenkins KnollSuite 438 2588 Lake Andrewport South Australia Australia

859 855 fulan 855 NaN 55 603 O'keefe KnollSuite 782 8822 Port Dylanmouth Tasmania Australia

863 859 fulan 859 NaN 38 32 Isla GroveApt. 078 7711 Rosechester New South Wales Australia

976 Murray Station StApt.


914 909 fulan 909 NaN 62 3227 Langfort Tasmania Australia
036

934 929 fulan 929 NaN 68 394 Lily HillSuite 153 2353 Beahanfurt Northern Territory Australia

948 943 fulan 943 NaN 64 3117 Heller PlaceSuite 149 822 North Elijah South Australia Australia

952 946 fulan 946 NaN 24 8227 Nicholas HillSuite 150 115 South Jasper Queensland Australia

1130 Turner Estate DrSuite


994 988 fulan 988 NaN 35 9386 New Harry Western Australia Australia
925

In [20]: customers_df.gender.value_counts()

Out[20]: Prefer not to say 725


Male 143
Female 115
Name: gender, dtype: int64

In [21]: customers_df.fillna(value="Prefer not to say", inplace=True)

In [22]: customers_df.isna().sum()

Out[22]: customer_id 0
customer_name 0
gender 0
age 0
home_address 0
zip_code 0
city 0
state 0
country 0
dtype: int64

In [23]: customers_df[customers_df.age == customers_df.age.max()]

Out[23]:
customer_id customer_name gender age home_address zip_code city state country

967 961 fulan 961 Prefer not to say 700 29 Farrell ParadeSuite 818 6528 New Joseph South Australia Australia

In [24]: customers_df.age.replace(customers_df.age.max(), 70, inplace=True)

In [25]: customers_df[customers_df.age == customers_df.age.max()]

Out[25]:
customer_id customer_name gender age home_address zip_code city state country

215 216 fulan 216 Prefer not to say 500 038 Haley MewsApt. 810 3991 Bayertown Northern Territory Australia

localhost:8888/notebooks/data_analyst/uts jesiska.ipynb#SEMESTER:V 5/25


01/11/23, 20.29 uts jesiska

In [26]: customers_df.age.replace(customers_df.age.max(), 50, inplace=True)

1.3.2 Membersihkan data orders_df

In [27]: datetime_columns = ["order_date", "delivery_date"]

for column in datetime_columns:


orders_df[column] = pd.to_datetime(orders_df[column])

In [28]: orders_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
order_id 1000 non-null int64
customer_id 1000 non-null int64
payment 1000 non-null int64
order_date 1000 non-null datetime64[ns]
delivery_date 1000 non-null datetime64[ns]
dtypes: datetime64[ns](2), int64(3)
memory usage: 39.1 KB

1.3.3 Membersihkan data product_df

In [29]: product_df.drop_duplicates(inplace=True)

In [30]: print("Jumlah duplikasi: ", product_df.duplicated().sum())

Jumlah duplikasi: 0

1.3.4 Membersihkan data sales_df

In [31]: sales_df[sales_df.total_price.isna()]

Out[31]:
sales_id order_id product_id price_per_unit quantity total_price

9 9 2 1196 105 1 NaN

121 121 27 1027 90 3 NaN

278 278 63 360 94 2 NaN

421 421 95 1091 115 1 NaN

489 489 108 1193 105 3 NaN

539 539 117 405 119 2 NaN

636 636 134 653 93 3 NaN

687 687 145 1138 102 1 NaN

854 854 177 64 104 1 NaN

1079 1079 222 908 94 3 NaN

1193 1193 248 1121 102 2 NaN

1313 1313 272 826 117 1 NaN

1548 1548 316 103 118 3 NaN

1688 1688 345 428 107 1 NaN

1775 1775 359 694 113 2 NaN

1902 1902 381 1218 105 3 NaN

2025 2025 408 611 112 3 NaN

2164 2164 436 583 100 3 NaN

2347 2347 476 696 113 2 NaN

In [32]: sales_df["total_price"] = sales_df["price_per_unit"] * sales_df["quantity"]

localhost:8888/notebooks/data_analyst/uts jesiska.ipynb#SEMESTER:V 6/25


01/11/23, 20.29 uts jesiska

In [33]: sales_df.isna().sum()

Out[33]: sales_id 0
order_id 0
product_id 0
price_per_unit 0
quantity 0
total_price 0
dtype: int64

2. Exploratory Data Analysis (EDA)

2.1. Eksplorasi data customers_df

In [34]: customers_df.describe(include="all")

Out[34]:
customer_id customer_name gender age home_address zip_code city state country

count 1001.000000 1001 1001 1001.000000 1001 1001.000000 1001 1001 1001

unique NaN 1000 3 NaN 1000 NaN 961 8 1

top NaN fulan 943 Prefer not to say NaN 3117 Heller PlaceSuite 149 NaN New Ava South Australia Australia

freq NaN 2 743 NaN 2 NaN 3 140 1001

mean 500.942058 NaN NaN 49.874126 NaN 5000.693307 NaN NaN NaN

std 289.013599 NaN NaN 17.644663 NaN 2886.084454 NaN NaN NaN

min 1.000000 NaN NaN 20.000000 NaN 2.000000 NaN NaN NaN

25% 251.000000 NaN NaN 34.000000 NaN 2398.000000 NaN NaN NaN

50% 501.000000 NaN NaN 50.000000 NaN 5079.000000 NaN NaN NaN

75% 751.000000 NaN NaN 65.000000 NaN 7454.000000 NaN NaN NaN

max 1000.000000 NaN NaN 80.000000 NaN 9998.000000 NaN NaN NaN

In [35]: customers_df.groupby(by="gender").agg({
"customer_id": "nunique",
"age": ["max", "min", "mean", "std"]
})

Out[35]:
customer_id age

nunique max min mean std

gender

Female 115 79 20 49.147826 16.646607

Male 143 80 20 51.230769 18.462635

Prefer not to say 742 80 20 49.725437 17.644283

In [36]: customers_df.groupby(by="city").customer_id.nunique().sort_values(ascending=
False)
customers_df.groupby(by="state").customer_id.nunique().sort_values(ascending
=False)

Out[36]: state
South Australia 139
Queensland 134
New South Wales 132
Northern Territory 125
Western Australia 124
Victoria 121
Australian Capital Territory 121
Tasmania 104
Name: customer_id, dtype: int64

localhost:8888/notebooks/data_analyst/uts jesiska.ipynb#SEMESTER:V 7/25


01/11/23, 20.29 uts jesiska

2.2. Eksplorasi data orders_df

In [38]: delivery_time = orders_df["delivery_date"] - orders_df["order_date"]


delivery_time = delivery_time.apply(lambda x: x.total_seconds())
orders_df["delivery_time"] = round(delivery_time/86400)

In [39]: orders_df.head()

Out[39]:
order_id customer_id payment order_date delivery_date delivery_time

0 1 64 30811 2021-08-30 2021-09-24 25.0

1 2 473 50490 2021-02-03 2021-02-13 10.0

2 3 774 46763 2021-10-08 2021-11-03 26.0

3 4 433 39782 2021-05-06 2021-05-19 13.0

4 5 441 14719 2021-03-23 2021-03-24 1.0

In [40]: orders_df.describe(include="all")

Out[40]:
order_id customer_id payment order_date delivery_date delivery_time

count 1000.000000 1000.000000 1000.000000 1000 1000 1000.000000

unique NaN NaN NaN 291 305 NaN

top NaN NaN NaN 2021-06-20 00:00:00 2021-08-09 00:00:00 NaN

freq NaN NaN NaN 10 9 NaN

first NaN NaN NaN 2021-01-01 00:00:00 2021-01-03 00:00:00 NaN

last NaN NaN NaN 2021-10-24 00:00:00 2021-11-20 00:00:00 NaN

mean 500.500000 506.640000 33972.936000 NaN NaN 14.078000

std 288.819436 277.115502 14451.609047 NaN NaN 7.707225

min 1.000000 1.000000 10043.000000 NaN NaN 1.000000

25% 250.750000 275.250000 21329.250000 NaN NaN 8.000000

50% 500.500000 515.000000 33697.500000 NaN NaN 14.000000

75% 750.250000 737.250000 46249.000000 NaN NaN 21.000000

max 1000.000000 1000.000000 59910.000000 NaN NaN 27.000000

2.3. Eksplorasi Data orders_df dan customers_df

In [41]: customer_id_in_orders_df = orders_df.customer_id.tolist()


customers_df["status"] = customers_df["customer_id"].apply(lambda x:"Active" if x in customer_id_in_orders_df
customers_df.sample(5)

Out[41]:
customer_id customer_name gender age home_address zip_code city state country status

244 245 fulan 245 Prefer not to say 36 733 Cole CrescentSuite 465 3849 South Abbeyshire Victoria Australia Non Active

102 103 fulan 103 Prefer not to say 77 100 Stokes ParkwayApt. 307 2603 South Nathan South Australia Australia Active

924 919 fulan 919 Prefer not to say 64 443 Jesse MewsApt. 726 24 New Imogen Queensland Australia Non Active

853 850 fulan 850 Male 55 1488 Kuhn AvenueApt. 525 9923 Lillianside New South Wales Australia Non Active

752 750 fulan 750 Prefer not to say 42 4155 Hughes RidgeApt. 624 7850 Baumbachton Victoria Australia Active

In [42]: customers_df.groupby(by="status").customer_id.count()

Out[42]: status
Active 617
Non Active 384
Name: customer_id, dtype: int64

localhost:8888/notebooks/data_analyst/uts jesiska.ipynb#SEMESTER:V 8/25


01/11/23, 20.29 uts jesiska

In [43]: orders_customers_df = pd.merge(left=orders_df,


right=customers_df,
how="left",
left_on="customer_id",
right_on="customer_id"
)
orders_customers_df.head()

Out[43]:
order_id customer_id payment order_date delivery_date delivery_time customer_name gender age home_address zip_code

Prefer 4927 Alice


0 1 64 30811 2021-08-30 2021-09-24 25.0 fulan 64 not to 75 MeadowApt. 7787 Sa
say 960

531 Schmitt
1 2 473 50490 2021-02-03 2021-02-13 10.0 fulan 473 Male 61 BoulevardApt. 1744
010

Prefer
2096 Wilson
2 3 774 46763 2021-10-08 2021-11-03 26.0 fulan 774 not to 34 8590
MewsApt. 714 J
say

Prefer
5777 Mayer
3 4 433 39782 2021-05-06 2021-05-19 13.0 fulan 433 not to 46 9728
PassApt. 881
say

Prefer 33 Richards
4 5 441 14719 2021-03-23 2021-03-24 1.0 fulan 441 not to 53 JunctionApt. 7650 So
say 478

2.3.1 Jumlah order berdasarkan kota

In [44]: orders_customers_df.groupby(by="city").order_id.nunique().sort_values(ascending=False).reset_index().head(10)

Out[44]:
city order_id

0 New Ava 6

1 Jordanside 6

2 Rubyfort 5

3 O'keefeton 5

4 West Kai 5

5 Port Hannahburgh 5

6 East Max 5

7 Lake Rose 5

8 Claudiaview 4

9 Andrewborough 4

2.3.2 Jumlah order berdasarkan state

In [45]: orders_customers_df.groupby(by="state").order_id.nunique().sort_values(ascending=False)

Out[45]: state
South Australia 148
Queensland 139
Western Australia 130
New South Wales 129
Victoria 118
Australian Capital Territory 118
Tasmania 112
Northern Territory 106
Name: order_id, dtype: int64

localhost:8888/notebooks/data_analyst/uts jesiska.ipynb#SEMESTER:V 9/25


01/11/23, 20.29 uts jesiska

2.3.3 Jumlah order berdasarkan gender

In [47]: orders_customers_df.groupby(by="gender").order_id.nunique().sort_values(ascending=False)

Out[47]: gender
Prefer not to say 725
Female 139
Male 136
Name: order_id, dtype: int64

2.3.4 Jumlah order berdasarkan kelompok usia

In [48]: orders_customers_df["age_group"] = orders_customers_df.age.apply(lambda x: "Youth" if x <= 24 else ("Seniors"


orders_customers_df.groupby(by="age_group").order_id.nunique().sort_values(ascending=False)

Out[48]: age_group
Adults 681
Seniors 226
Youth 93
Name: order_id, dtype: int64

2.4. Eksplorasi Data product_df dan sales_df


In [49]: product_df.describe(include="all")
sales_df.describe(include="all")

Out[49]:
sales_id order_id product_id price_per_unit quantity total_price

count 5000.000000 5000.000000 5000.000000 5000.000000 5000.00000 5000.000000

mean 2499.500000 503.038200 634.053200 103.501600 1.99240 206.360000

std 1443.520003 285.964418 363.255794 9.195004 0.80751 86.357457

min 0.000000 1.000000 1.000000 90.000000 1.00000 90.000000

25% 1249.750000 258.000000 323.000000 95.000000 1.00000 112.000000

50% 2499.500000 504.500000 635.000000 102.000000 2.00000 204.000000

75% 3749.250000 749.000000 951.000000 112.000000 3.00000 285.000000

max 4999.000000 999.000000 1259.000000 119.000000 3.00000 357.000000

localhost:8888/notebooks/data_analyst/uts jesiska.ipynb#SEMESTER:V 10/25


01/11/23, 20.29 uts jesiska

In [50]: product_df.sort_values(by="price", ascending=False)

localhost:8888/notebooks/data_analyst/uts jesiska.ipynb#SEMESTER:V 11/25


01/11/23, 20.29 uts jesiska

Out[50]:
product_id product_type product_name size colour price quantity description

704 698 Jacket Parka L violet 119 53 A violet coloured, L sized, Parka Jacket

671 665 Jacket Parka XS red 119 65 A red coloured, XS sized, Parka Jacket

698 692 Jacket Parka M indigo 119 66 A indigo coloured, M sized, Parka Jacket

699 693 Jacket Parka L indigo 119 44 A indigo coloured, L sized, Parka Jacket

700 694 Jacket Parka XL indigo 119 78 A indigo coloured, XL sized, Parka Jacket

701 695 Jacket Parka XS violet 119 59 A violet coloured, XS sized, Parka Jacket

702 696 Jacket Parka S violet 119 67 A violet coloured, S sized, Parka Jacket

703 697 Jacket Parka M violet 119 75 A violet coloured, M sized, Parka Jacket

951 945 Trousers Slim-Fit XS red 119 72 A red coloured, XS sized, Slim-Fit Trousers

952 946 Trousers Slim-Fit S red 119 71 A red coloured, S sized, Slim-Fit Trousers

69 69 Shirt Dress XL violet 119 77 A violet coloured, XL sized, Dress Shirt

68 68 Shirt Dress L violet 119 74 A violet coloured, L sized, Dress Shirt

67 67 Shirt Dress M violet 119 41 A violet coloured, M sized, Dress Shirt

66 66 Shirt Dress S violet 119 69 A violet coloured, S sized, Dress Shirt

65 65 Shirt Dress XS violet 119 63 A violet coloured, XS sized, Dress Shirt

64 64 Shirt Dress XL indigo 119 75 A indigo coloured, XL sized, Dress Shirt

63 63 Shirt Dress L indigo 119 71 A indigo coloured, L sized, Dress Shirt

62 62 Shirt Dress M indigo 119 51 A indigo coloured, M sized, Dress Shirt

61 61 Shirt Dress S indigo 119 40 A indigo coloured, S sized, Dress Shirt

60 60 Shirt Dress XS indigo 119 56 A indigo coloured, XS sized, Dress Shirt

59 59 Shirt Dress XL blue 119 66 A blue coloured, XL sized, Dress Shirt

58 58 Shirt Dress L blue 119 61 A blue coloured, L sized, Dress Shirt

57 57 Shirt Dress M blue 119 48 A blue coloured, M sized, Dress Shirt

56 56 Shirt Dress S blue 119 46 A blue coloured, S sized, Dress Shirt

55 55 Shirt Dress XS blue 119 41 A blue coloured, XS sized, Dress Shirt

697 691 Jacket Parka S indigo 119 53 A indigo coloured, S sized, Parka Jacket

696 690 Jacket Parka XS indigo 119 57 A indigo coloured, XS sized, Parka Jacket

695 689 Jacket Parka XL blue 119 47 A blue coloured, XL sized, Parka Jacket

683 677 Jacket Parka M yellow 119 46 A yellow coloured, M sized, Parka Jacket

673 667 Jacket Parka M red 119 56 A red coloured, M sized, Parka Jacket

... ... ... ... ... ... ... ... ...

1209 1203 Trousers Pleated L yellow 90 62 A yellow coloured, L sized, Pleated Trousers

1208 1202 Trousers Pleated M yellow 90 54 A yellow coloured, M sized, Pleated Trousers

1207 1201 Trousers Pleated S yellow 90 71 A yellow coloured, S sized, Pleated Trousers

1206 1200 Trousers Pleated XS yellow 90 75 A yellow coloured, XS sized, Pleated Trousers

1204 1198 Trousers Pleated L orange 90 53 A orange coloured, L sized, Pleated Trousers

1229 1223 Trousers Pleated L violet 90 48 A violet coloured, L sized, Pleated Trousers

1203 1197 Trousers Pleated M orange 90 57 A orange coloured, M sized, Pleated Trousers

1202 1196 Trousers Pleated S orange 90 72 A orange coloured, S sized, Pleated Trousers

1201 1195 Trousers Pleated XS orange 90 74 A orange coloured, XS sized, Pleated Trousers

1200 1194 Trousers Pleated XL red 90 58 A red coloured, XL sized, Pleated Trousers

1199 1193 Trousers Pleated L red 90 70 A red coloured, L sized, Pleated Trousers

1198 1192 Trousers Pleated M red 90 40 A red coloured, M sized, Pleated Trousers

1212 1206 Trousers Pleated S green 90 55 A green coloured, S sized, Pleated Trousers

1213 1207 Trousers Pleated M green 90 41 A green coloured, M sized, Pleated Trousers

1214 1208 Trousers Pleated L green 90 47 A green coloured, L sized, Pleated Trousers

1215 1209 Trousers Pleated XL green 90 55 A green coloured, XL sized, Pleated Trousers

1216 1210 Trousers Pleated XS blue 90 62 A blue coloured, XS sized, Pleated Trousers

1217 1211 Trousers Pleated S blue 90 60 A blue coloured, S sized, Pleated Trousers

1218 1212 Trousers Pleated M blue 90 66 A blue coloured, M sized, Pleated Trousers

1219 1213 Trousers Pleated L blue 90 79 A blue coloured, L sized, Pleated Trousers

1220 1214 Trousers Pleated XL blue 90 55 A blue coloured, XL sized, Pleated Trousers

localhost:8888/notebooks/data_analyst/uts jesiska.ipynb#SEMESTER:V 12/25


01/11/23, 20.29 uts jesiska
product_id product_type product_name size colour price quantity description

1221 1215 Trousers Pleated XS indigo 90 52 A indigo coloured, XS sized, Pleated Trousers

1222 1216 Trousers Pleated S indigo 90 41 A indigo coloured, S sized, Pleated Trousers

1223 1217 Trousers Pleated M indigo 90 48 A indigo coloured, M sized, Pleated Trousers

1224 1218 Trousers Pleated L indigo 90 73 A indigo coloured, L sized, Pleated Trousers

1225 1219 Trousers Pleated XL indigo 90 45 A indigo coloured, XL sized, Pleated Trousers

1226 1220 Trousers Pleated XS violet 90 58 A violet coloured, XS sized, Pleated Trousers

1227 1221 Trousers Pleated S violet 90 50 A violet coloured, S sized, Pleated Trousers

1228 1222 Trousers Pleated M violet 90 45 A violet coloured, M sized, Pleated Trousers

599 593 Jacket Bomber L violet 90 68 A violet coloured, L sized, Bomber Jacket

1260 rows × 8 columns

localhost:8888/notebooks/data_analyst/uts jesiska.ipynb#SEMESTER:V 13/25


01/11/23, 20.29 uts jesiska

In [51]: product_df.groupby(by="product_type").agg({
"product_id": "nunique",
"quantity": "sum",
"price": ["min", "max"]
})

product_df.groupby(by="product_name").agg({
"product_id": "nunique",
"quantity": "sum",
"price": ["min", "max"]
})

Out[51]:
product_id quantity price

nunique sum min max

product_name

Bomber 35 2083 90 90

Camp Collared 35 2071 112 112

Cardigan 35 2032 118 118

Cargo Pants 35 2146 106 106

Casual Slim Fit 35 2086 113 113

Chambray 35 2020 105 105

Chinos 35 2101 100 100

Coach 35 2158 115 115

Cords 35 2260 113 113

Cropped 35 2085 99 99

Cuban Collar 35 2017 93 93

Denim 70 4204 92 115

Drawstring 35 2042 104 104

Dress 35 2125 119 119

Flannel 35 2135 96 96

Henley 35 2051 92 92

High-Waisted 35 2198 98 98

Joggers 35 2107 94 94

Leather 35 2276 113 113

Linen 35 2138 116 116

Mandarin Collar 35 2203 108 108

Oxford Cloth 35 2071 114 114

Parka 35 2201 119 119

Peacoat 35 2067 102 102

Pleated 35 2030 90 90

Polo 35 2196 117 117

Puffer 35 1946 110 110

Pullover 35 2184 114 114

Relaxed Leg 35 2002 95 95

Shearling 35 2169 95 95

Slim-Fit 35 2155 119 119

Tracksuit Bottoms 35 2038 91 91

Trench Coat 35 2132 112 112

Windbreaker 35 2085 109 109

Wool 35 1975 111 111

localhost:8888/notebooks/data_analyst/uts jesiska.ipynb#SEMESTER:V 14/25


01/11/23, 20.29 uts jesiska

In [52]: sales_product_df = pd.merge(


left=sales_df,
right=product_df,
how="left",
left_on="product_id",
right_on="product_id"
)
sales_product_df.head()

Out[52]:
sales_id order_id product_id price_per_unit quantity_x total_price product_type product_name size colour price quantity_y d

0 0 1 218 106 2 212 Shirt Chambray L orange 105 44

c
1 1 1 481 118 1 118 Jacket Puffer S indigo 110 62

c
2 2 1 2 96 3 288 Shirt Oxford Cloth M red 114 54

c
3 3 1 1002 106 2 212 Trousers Wool M blue 111 52
s

c
4 4 1 691 113 3 339 Jacket Parka S indigo 119 53

In [53]: sales_product_df.groupby(by="product_type").agg({
"sales_id": "nunique",
"quantity_x": "sum",
"total_price": "sum"
})

Out[53]:
sales_id quantity_x total_price

product_type

Jacket 1676 3343 357026

Shirt 1641 3259 333600

Trousers 1683 3360 341174

localhost:8888/notebooks/data_analyst/uts jesiska.ipynb#SEMESTER:V 15/25


01/11/23, 20.29 uts jesiska

In [54]: sales_product_df.groupby(by="product_name").agg({
"sales_id": "nunique",
"quantity_x": "sum",
"total_price": "sum"
}).sort_values(by="total_price", ascending=False)

Out[54]:
sales_id quantity_x total_price

product_name

Denim 273 527 52399

Casual Slim Fit 154 306 36414

Trench Coat 146 299 35581

Shearling 150 302 35334

Puffer 140 298 35164

Flannel 141 281 33158

Cropped 135 284 32660

Pleated 147 308 32340

Joggers 164 334 31062

Chambray 141 290 30740

Parka 134 269 30397

Bomber 150 297 29700

Chinos 135 260 29380

Windbreaker 143 287 29274

Cardigan 134 260 29120

Peacoat 132 266 28994

Tracksuit Bottoms 130 257 28784

Slim-Fit 154 288 27936

Oxford Cloth 146 289 27744

Drawstring 147 288 27648

High-Waisted 140 278 27522

Camp Collared 139 279 27342

Cargo Pants 134 267 27234

Relaxed Leg 146 301 27090

Cuban Collar 122 254 26670

Henley 146 281 26414

Wool 125 249 26394

Leather 136 263 25774

Mandarin Collar 124 236 25724

Dress 127 243 25272

Pullover 143 276 25116

Polo 131 264 25080

Coach 134 265 24645

Linen 131 270 24570

Cords 126 246 23124

localhost:8888/notebooks/data_analyst/uts jesiska.ipynb#SEMESTER:V 16/25


01/11/23, 20.29 uts jesiska

2.5. Eksplorasi data all_df


In [55]: all_df = pd.merge(
left=sales_product_df,
right=orders_customers_df,
how="left",
left_on="order_id",
right_on="order_id"
)
all_df.head()

Out[55]:
sales_id order_id product_id price_per_unit quantity_x total_price product_type product_name size colour ... customer_name

0 0 1 218 106 2 212 Shirt Chambray L orange ... fulan 64

1 1 1 481 118 1 118 Jacket Puffer S indigo ... fulan 64

2 2 1 2 96 3 288 Shirt Oxford Cloth M red ... fulan 64

3 3 1 1002 106 2 212 Trousers Wool M blue ... fulan 64

4 4 1 691 113 3 339 Jacket Parka S indigo ... fulan 64

5 rows × 28 columns

localhost:8888/notebooks/data_analyst/uts jesiska.ipynb#SEMESTER:V 17/25


01/11/23, 20.29 uts jesiska

In [56]: all_df.groupby(by=["state", "product_type"]).agg({


"quantity_x": "sum",
"total_price": "sum"
})

Out[56]:
quantity_x total_price

state product_type

Australian Capital Territory Jacket 406 43204

Shirt 396 40448

Trousers 454 46790

New South Wales Jacket 451 47998

Shirt 431 43980

Trousers 392 39766

Northern Territory Jacket 365 38991

Shirt 336 33865

Trousers 384 38998

Queensland Jacket 499 53511

Shirt 417 42506

Trousers 453 46045

South Australia Jacket 461 49090

Shirt 509 52685

Trousers 455 46041

Tasmania Jacket 412 44370

Shirt 387 39668

Trousers 379 38057

Victoria Jacket 359 38203

Shirt 390 40206

Trousers 414 41948

Western Australia Jacket 390 41659

Shirt 393 40242

Trousers 429 43529

In [57]: all_df.groupby(by=["gender", "product_type"]).agg({


"quantity_x": "sum",
"total_price": "sum"
})

all_df.groupby(by=["age_group", "product_type"]).agg({
"quantity_x": "sum",
"total_price": "sum"
})

Out[57]:
quantity_x total_price

age_group product_type

Adults Jacket 2292 245055

Shirt 2225 227781

Trousers 2272 231271

Seniors Jacket 777 82959

Shirt 733 74977

Trousers 766 77471

Youth Jacket 274 29012

Shirt 301 30842

Trousers 322 32432

In [58]: all_df.to_csv("all_data.csv", index=False)

localhost:8888/notebooks/data_analyst/uts jesiska.ipynb#SEMESTER:V 18/25


01/11/23, 20.29 uts jesiska

3. Data Visualization

3.1. Bagaimana Performa Penjualan dan Revenue Perusahaan dalam Beberapa Bulan Terakhir?
In [61]: monthly_orders_df = all_df.resample(rule='M', on='order_date').agg({
"order_id": "nunique",
"total_price": "sum"
})
monthly_orders_df.index = monthly_orders_df.index.strftime('%Y-%m')
monthly_orders_df = monthly_orders_df.reset_index()
monthly_orders_df.rename(columns={
"order_id": "order_count",
"total_price": "revenue"
}, inplace=True)
monthly_orders_df.head()

Out[61]:
index order_count revenue

0 2021-01 109 119333

1 2021-02 93 95080

2 2021-03 117 131364

3 2021-04 99 97530

4 2021-05 84 85597

In [62]: monthly_orders_df = all_df.resample(rule='M', on='order_date').agg({


"order_id": "nunique",
"total_price": "sum"
})
monthly_orders_df.index = monthly_orders_df.index.strftime('%B') #mengubah format order date menjadi nama bula

monthly_orders_df = monthly_orders_df.reset_index()
monthly_orders_df.rename(columns={
"order_id": "order_count",
"total_price": "revenue"
}, inplace=True)
monthly_orders_df.rename(columns={"index":"order_date"}, inplace=True)
plt.figure(figsize=(10, 5))
plt.plot(monthly_orders_df["order_date"], monthly_orders_df["order_count"],
marker='o', linewidth=2, color="#72BCD4")
plt.title("Number of Orders per Month (2021)", loc="center", fontsize=20)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.show();

localhost:8888/notebooks/data_analyst/uts jesiska.ipynb#SEMESTER:V 19/25


01/11/23, 20.29 uts jesiska

In [63]: plt.figure(figsize=(10, 5))


plt.plot(
monthly_orders_df["order_date"],
monthly_orders_df["revenue"],
marker='o',
linewidth=2,
color="#72BCD4"
)
plt.title("Total Revenue per Month (2021)", loc="center", fontsize=20)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.show();

3.2. Produk Apa yang Paling Banyak dan Paling Sedikit Terjual?
In [64]: sum_order_items_df = all_df.groupby("product_name").quantity_x.sum().sort_values(ascending=False).reset_index
sum_order_items_df.head(15)

Out[64]:
product_name quantity_x

0 Denim 527

1 Joggers 334

2 Pleated 308

3 Casual Slim Fit 306

4 Shearling 302

5 Relaxed Leg 301

6 Trench Coat 299

7 Puffer 298

8 Bomber 297

9 Chambray 290

10 Oxford Cloth 289

11 Drawstring 288

12 Slim-Fit 288

13 Windbreaker 287

14 Cropped 284

localhost:8888/notebooks/data_analyst/uts jesiska.ipynb#SEMESTER:V 20/25


01/11/23, 20.29 uts jesiska

In [65]: fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(24, 6));

In [66]: fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(24, 6))

colors = ["#72BCD4", "#D3D3D3", "#D3D3D3", "#D3D3D3", "#D3D3D3"]

sns.barplot(x="quantity_x", y="product_name",
data=sum_order_items_df.head(5), palette=colors, ax=ax[0])
ax[0].set_ylabel(None)
ax[0].set_xlabel(None)
ax[0].set_title("Best Performing Product", loc="center", fontsize=15)
ax[0].tick_params(axis ='y', labelsize=12)

sns.barplot(x="quantity_x", y="product_name",
data=sum_order_items_df.sort_values(by="quantity_x",
ascending=True).head(5), palette=colors, ax=ax[1])
ax[1].set_ylabel(None)
ax[1].set_xlabel(None)
ax[1].invert_xaxis()
ax[1].yaxis.set_label_position("right")
ax[1].yaxis.tick_right()
ax[1].set_title("Worst Performing Product", loc="center", fontsize=15)
ax[1].tick_params(axis='y', labelsize=12)

plt.suptitle("Best and Worst Performing Product by Number of Sales",


fontsize=20)
plt.show();

localhost:8888/notebooks/data_analyst/uts jesiska.ipynb#SEMESTER:V 21/25


01/11/23, 20.29 uts jesiska

3.3. Bagaimana Demografi Pelanggan yang Kita Miliki?

3.3.1 berdasarkan gender

In [67]: bygender_df = all_df.groupby(by="gender").customer_id.nunique().reset_index()


bygender_df.rename(columns={
"customer_id": "customer_count"
}, inplace=True)

plt.figure(figsize=(10, 5))

sns.barplot(
y="customer_count",
x="gender",
data=bygender_df.sort_values(by="customer_count",
ascending=False),
palette=colors
)
plt.title("Number of Customer by Gender", loc="center", fontsize=15)
plt.ylabel(None)
plt.xlabel(None)
plt.tick_params(axis='x', labelsize=12)
plt.show();

localhost:8888/notebooks/data_analyst/uts jesiska.ipynb#SEMESTER:V 22/25


01/11/23, 20.29 uts jesiska

3.3.2 berdasarkan age

In [68]: byage_df = all_df.groupby(by="age_group").customer_id.nunique().reset_index()


byage_df.rename(columns={
"customer_id": "customer_count"
}, inplace=True)
byage_df
byage_df['age_group'] = pd.Categorical(byage_df['age_group'],
["Youth", "Adults", "Seniors"])
plt.figure(figsize=(10, 5))
colors_ = ["#D3D3D3", "#72BCD4", "#D3D3D3", "#D3D3D3", "#D3D3D3"]

sns.barplot(
y="customer_count",
x="age_group",
data=byage_df.sort_values(by="age_group", ascending=False),
palette=colors_
)
plt.title("Number of Customer by Age", loc="center", fontsize=15)
plt.ylabel(None)
plt.xlabel(None)
plt.tick_params(axis='x', labelsize=12)
plt.show();

localhost:8888/notebooks/data_analyst/uts jesiska.ipynb#SEMESTER:V 23/25


01/11/23, 20.29 uts jesiska

3.3.3 berdasarkan state

In [69]: bystate_df = all_df.groupby(by="state").customer_id.nunique().reset_index()


bystate_df.rename(columns={
"customer_id": "customer_count"
}, inplace=True)
bystate_df
plt.figure(figsize=(10, 5))
colors_ = ["#72BCD4", "#D3D3D3", "#D3D3D3", "#D3D3D3", "#D3D3D3",
"#D3D3D3", "#D3D3D3", "#D3D3D3"]
sns.barplot(
x="customer_count",
y="state",
data=bystate_df.sort_values(by="customer_count",
ascending=False),
palette=colors_
)
plt.title("Number of Customer by States", loc="center", fontsize=15)
plt.ylabel(None)
plt.xlabel(None)
plt.tick_params(axis='y', labelsize=12)
plt.show();

4-6 RFM Analysis


In [70]: rfm_df = all_df.groupby(by="customer_id", as_index=False).agg({
"order_date": "max", # mengambil tanggal order terakhir
"order_id": "nunique", # menghitung jumlah order
"total_price": "sum" # menghitung jumlah revenue yang dihasilkan
})
rfm_df.columns = ["customer_id", "max_order_timestamp", "frequency", "monetary"]

# menghitung kapan terakhir pelanggan melakukan transaksi (hari)


rfm_df["max_order_timestamp"] = rfm_df["max_order_timestamp"].dt.date
recent_date = orders_df["order_date"].dt.date.max()
rfm_df["recency"] = rfm_df["max_order_timestamp"].apply(lambda x:
(recent_date - x).days)

rfm_df.drop("max_order_timestamp", axis=1, inplace=True)


rfm_df.head()

Out[70]:
customer_id frequency monetary recency

0 1 3 1641 203

1 7 1 1017 156

2 10 1 270 229

3 11 1 382 149

4 12 1 1551 127

localhost:8888/notebooks/data_analyst/uts jesiska.ipynb#SEMESTER:V 24/25


01/11/23, 20.29 uts jesiska

In [71]: fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(30, 6))

colors = ["#72BCD4", "#72BCD4", "#72BCD4", "#72BCD4", "#72BCD4"]

sns.barplot(y="recency", x="customer_id",
data=rfm_df.sort_values(by="recency", ascending=True).head(5),
palette=colors, ax=ax[0])
ax[0].set_ylabel(None)
ax[0].set_xlabel(None)
ax[0].set_title("By Recency (days)", loc="center", fontsize=18)
ax[0].tick_params(axis ='x', labelsize=15)

sns.barplot(y="frequency", x="customer_id",
data=rfm_df.sort_values(by="frequency", ascending=False).head(5),
palette=colors, ax=ax[1])
ax[1].set_ylabel(None)
ax[1].set_xlabel(None)
ax[1].set_title("By Frequency", loc="center", fontsize=18)
ax[1].tick_params(axis='x', labelsize=15)

sns.barplot(y="monetary", x="customer_id",
data=rfm_df.sort_values(by="monetary", ascending=False).head(5),
palette=colors, ax=ax[2])
ax[2].set_ylabel(None)
ax[2].set_xlabel(None)
ax[2].set_title("By Monetary", loc="center", fontsize=18)
ax[2].tick_params(axis='x', labelsize=15)

plt.suptitle("Best Customer Based on RFM Parameters (customer_id)",


fontsize=20)
plt.show();

In [ ]: ​

localhost:8888/notebooks/data_analyst/uts jesiska.ipynb#SEMESTER:V 25/25

You might also like