Download as pdf or txt
Download as pdf or txt
You are on page 1 of 2

5/30/23, 1:43 PM test1.ipynb - Colaboratory

Membaca data

# Load the raw dataset
import pandas as pd

# Location of the source CSV file
file_path = "C:/Users/bagus/Downloads/data.csv"

# Parse the CSV into a DataFrame, decoding the file as latin1
data = pd.read_csv(filepath_or_buffer=file_path, encoding="latin1")

melihat dimensi data (baris, kolom)

# Report the dataset dimensions as (rows, columns)
n_rows, n_cols = data.shape
print('Dataframe dimensions:', (n_rows, n_cols))

Dataframe dimensions: (541909, 8)

Mengecek null values masing-masing kolom

# Build a per-column summary table with three rows: the column dtype,
# the number of null values, and the percentage of null values.
#
# DataFrame.append is deprecated (it raises a FutureWarning here) and was
# removed in pandas 2.0, so the rows are stacked with pd.concat instead.
null_counts = data.isnull().sum()  # computed once, reused for the percentage row
tab_info = pd.concat(
    [
        pd.DataFrame(data.dtypes).T.rename(index={0: 'column type'}),
        pd.DataFrame(null_counts).T.rename(index={0: 'null values (nb)'}),
        pd.DataFrame(null_counts / data.shape[0] * 100).T.rename(
            index={0: 'null values (%)'}
        ),
    ]
)
display(tab_info)

C:\Users\bagus\AppData\Local\Temp\ipykernel_64872\2220007537.py:3: FutureWarning: The frame.append method is deprecated and will be


tab_info=tab_info.append(pd.DataFrame(data.isnull().sum()).T.rename(index={0:'null values (nb)'}))
C:\Users\bagus\AppData\Local\Temp\ipykernel_64872\2220007537.py:4: FutureWarning: The frame.append method is deprecated and will be
tab_info=tab_info.append(pd.DataFrame(data.isnull().sum()/data.shape[0]*100).T.
InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID Country

column type object object object int64 object float64 float64 object

null values (nb) 0 0 1454 0 0 0 135080 0

null values (%) 0.0 0.0 0.268311 0.0 0.0 0.0 24.926694 0.0

Menghapus record data yang memiliki null values pada kolom CustomerID dan Description

# Remove, in place, every row that has a null in CustomerID or Description,
# then report the resulting dimensions.
required_columns = ['CustomerID', 'Description']
data.dropna(axis='index', subset=required_columns, inplace=True)
print('Dataframe dimensions:', data.shape)

Dataframe dimensions: (406829, 8)

mengecek dan menghapus data duplikat

# Count fully duplicated rows, report the count, then drop them in place
duplicate_count = data.duplicated().sum()
print(f'data duplikat: {duplicate_count}')
data.drop_duplicates(inplace=True)

data duplikat: 5225

# Print a summary of the DataFrame (index, columns, non-null counts, dtypes)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 401604 entries, 0 to 541908
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 InvoiceNo 401604 non-null object
1 StockCode 401604 non-null object
2 Description 401604 non-null object
3 Quantity 401604 non-null int64
4 InvoiceDate 401604 non-null object
5 UnitPrice 401604 non-null float64
6 CustomerID 401604 non-null float64
7 Country 401604 non-null object
dtypes: float64(2), int64(1), object(5)
memory usage: 27.6+ MB

https://colab.research.google.com/drive/1RN8ynXJgi1K9A3NwpVHhfch6f8XN0BHK#printMode=true 1/2
5/30/23, 1:43 PM test1.ipynb - Colaboratory

Mengonversi kolom 'InvoiceDate' menjadi tipe data datetime, dan menampilkan rentang waktu data

# Convert the 'InvoiceDate' column to the datetime dtype
invoice_dates = pd.to_datetime(data['InvoiceDate'])
data['InvoiceDate'] = invoice_dates

# Earliest and latest timestamps found in 'InvoiceDate'
start_date, end_date = invoice_dates.min(), invoice_dates.max()

# Display the time range covered by the data
print("Rentang waktu InvoiceDate:")
for label, moment in (("Mulai dari:", start_date), ("Hingga:", end_date)):
    print(label, moment)

Rentang waktu InvoiceDate:


Mulai dari: 2010-12-01 08:26:00
Hingga: 2011-12-09 12:50:00

menyimpan data yang sudah dibersihkan ke format CSV

# Write the cleaned data to a CSV file, leaving out the index column
data.to_csv(path_or_buf='data_baru.csv', index=False)

memeriksa rangkuman informasi dataset yang sudah dibersihkan

# Inspect the summary information of the cleaned dataset


data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 401604 entries, 0 to 541908
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 InvoiceNo 401604 non-null object
1 StockCode 401604 non-null object
2 Description 401604 non-null object
3 Quantity 401604 non-null int64
4 InvoiceDate 401604 non-null datetime64[ns]
5 UnitPrice 401604 non-null float64
6 CustomerID 401604 non-null float64
7 Country 401604 non-null object
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 27.6+ MB

Colab paid products - Cancel contracts here

https://colab.research.google.com/drive/1RN8ynXJgi1K9A3NwpVHhfch6f8XN0BHK#printMode=true 2/2

You might also like