Download as pdf or txt
Download as pdf or txt
You are on page 1 of 3

1/18/2021 Lab 2 - Data Preparation 1.ipynb - Colaboratory

Lab 2 - Data Preparation 1

#1. Install Apache Spark


!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-2.4.7/spark-2.4.7-bin-hadoop2.6.tgz
!tar xf spark-2.4.7-bin-hadoop2.6.tgz
!pip install -q findspark

#2. Setting environment variable


import os

# Point the tooling at the JDK and at the unpacked Spark distribution.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-2.4.7-bin-hadoop2.6",
    }
)

#3. Inisiasi spark


import findspark

# Let findspark set up the environment before pyspark is imported.
findspark.init()

from pyspark.sql import SparkSession

# Create (or reuse) a session that runs locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

#4. Upload file


from google.colab import files
!rm data_telepon_seluler.csv
files.upload()

#5. Load data


from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
# Read the uploaded CSV: the first row is the header, column types are
# inferred from the data, and fields are comma-separated.
dataset = spark.read.csv('data_telepon_seluler.csv', inferSchema=True, header=True, sep=",")
# Print the inferred column names and types.
dataset.printSchema()

#6. Menampilkan data


dataset.show() # prints the first 20 rows (the default)
#dataset.head() # first row only: PySpark's head() without an argument returns a single Row
#dataset.first() # first row only
#dataset.head(10) # first 10 rows, as a list of Row objects

#7. Cek tipe data


type(dataset)

#8. Menampilkan data


#collect data + metadata
dataset.select('*').collect()
dataset.select('provinsi', '2012').collect()
#show data saja
https://colab.research.google.com/drive/1fJ3YNoDYuQvEV8bcdd5xTkacPysJvFXb#scrollTo=TLrIqNPgcz4B&printMode=true 1/3
1/18/2021 Lab 2 - Data Preparation 1.ipynb - Colaboratory

# collect(): bring every row back to the driver as a list of Row objects


dataset.select('*').collect()
dataset.select('provinsi', '2012').collect()
# show(): only print the data as a table
dataset.select('*').show()
dataset.select('provinsi', '2012').show()
# take(n): like collect(), but only for the first n rows
dataset.select('*').take(5)
dataset.select('provinsi', '2012').take(5)

#9. Cek tipe data kolom


dataset.select('provinsi')

#10. Distinct
dataset.select('provinsi', '2012').distinct().show()

#11. Menampilkan daftar kolom


dataset.columns

#12. Menampilkan data


dataset.select(dataset.columns[0:3]).show()

#13. Menampilkan data


# Print only the first two rows, truncating long cell values.
dataset.show(2, truncate=True)
# Collect once and reuse the first Row instead of pulling the whole
# DataFrame to the driver twice.
first_row = dataset.collect()[0]
X = first_row['2014']  # field access by column name
X = first_row[3]       # field access by position (fourth column)

#14. Menampilkan sebagian data


# Columns to keep in the subset.
selected_columns = ["provinsi", "kode_wilayah", "2012"]
# Unpack the list so each name is passed to select() as its own argument.
subset_df_2 = dataset.select(*selected_columns)
subset_df_2.head()

#15. Filtering
dataset.filter("provinsi = 'DI YOGYAKARTA'")
dataset.filter("provinsi in ('DI YOGYAKARTA')")

#16. Menampilkan data null


dataset.where(dataset["2012"].isNull()).show()
dataset.where(dataset["2012"].isNotNull()).show(999)

#17. Menampilkan struktur data


https://colab.research.google.com/drive/1fJ3YNoDYuQvEV8bcdd5xTkacPysJvFXb#scrollTo=TLrIqNPgcz4B&printMode=true 2/3
1/18/2021 Lab 2 - Data Preparation 1.ipynb - Colaboratory
# Report the DataFrame's shape as (row count, column count).
print((dataset.count(), len(dataset.columns)))

#18. Menampilkan rangkuman data


dataset.describe().show()
dataset.describe("2012").show()

#19. Mengganti tipe kolom


# Register the DataFrame as a temporary view so it can be queried with SQL.
dataset.createOrReplaceTempView("tmpprov")
# The year columns have purely numeric names, so they must be quoted with
# backticks. In single quotes they are string literals, and int('2012')
# yields the constant 2012 on every row instead of the column's values.
# The aliases keep the original column names on the cast result.
df4 = spark.sql(
    "SELECT provinsi, "
    "int(`2012`) AS `2012`, int(`2013`) AS `2013`, int(`2014`) AS `2014` "
    "FROM tmpprov"
)
# Compare the schemas before and after the cast.
dataset.printSchema()
df4.printSchema()

Copy protected with Online-PDF-No-Copy.com


https://colab.research.google.com/drive/1fJ3YNoDYuQvEV8bcdd5xTkacPysJvFXb#scrollTo=TLrIqNPgcz4B&printMode=true 3/3

You might also like