Professional Documents
Culture Documents
02 Data - Engg - 23-24 Worksheet Practical #5b
Practical #5
to_date()
import org.apache.spark.sql.functions._
// to_date() parses a string column into a DateType column using the given pattern.
Seq(("04/13/2019"))
  .toDF("Input")
  .select(
    col("Input"),
    // "MM/dd/yyyy" matches the sample value "04/13/2019".
    to_date(col("Input"), "MM/dd/yyyy").as("to_date")
  ).show() // original snippet never closed select() nor displayed the result
datediff()
import org.apache.spark.sql.functions._
// datediff(end, start) returns the whole number of days between two dates.
val dates = Seq(("2019-01-23"), ("2019-06-24"), ("2019-09-20")).toDF("input")
dates
  .select(
    col("input"),
    current_date(),
    // Days elapsed from each input date up to today.
    datediff(current_date(), col("input")).as("diff")
  )
  .show()
months_between()
import org.apache.spark.sql.functions._
// months_between(end, start) returns the (possibly fractional) number of months
// between two dates; shown next to datediff for comparison.
// Fix: a page-header line ("Vidyalankar School of Information Technology") had
// been pasted into the middle of this expression, breaking compilation.
Seq(("2019-01-23"), ("2019-06-24"), ("2019-09-20"))
  .toDF("date")
  .select(
    col("date"),
    current_date(),
    datediff(current_date(), col("date")).as("datediff"),
    months_between(current_date(), col("date")).as("months_between")
  ).show()
current_timestamp()
import org.apache.spark.sql.functions._
// current_date()/current_timestamp() take no input column, so start from a
// one-row dummy DataFrame.
// Fix: a page-header line had been pasted between the two withColumn calls,
// breaking compilation. Also dropped the redundant .as(...) — withColumn
// already assigns the column name.
val df = Seq((1)).toDF("seq")
val curDate = df
  .withColumn("current_date", current_date())
  .withColumn("current_timestamp", current_timestamp())
curDate.show(false)
to_timestamp()
import org.apache.spark.sql.functions._
// to_timestamp() parses custom-formatted strings into a TimestampType column.
val dfDate = Seq(
  ("07-01-2019 12 01 19 406"),
  ("06-24-2019 12 01 19 406"),
  ("11-16-2019 16 44 55 406"),
  ("11-16-2019 16 50 59 406")
).toDF("input_timestamp")

// Pattern uses spaces (not colons) between time fields, matching the data above.
val parsed = dfDate.withColumn(
  "datetype_timestamp",
  to_timestamp(col("input_timestamp"), "MM-dd-yyyy HH mm ss SSS")
)
parsed.show(false)
json_tuple() – Extracts the named fields from a JSON string and creates them as new columns
import org.apache.spark.sql.functions.{json_tuple}
// json_tuple() pulls several fields out of a JSON string column in one call.
// NOTE(review): assumes df has an "id" column and a JSON-string "value"
// column — confirm against the DataFrame set up earlier in the worksheet.
val extracted = df.select(
  col("id"),
  json_tuple(col("value"), "Zipcode", "ZipCodeType", "City")
)
// Rename the generated c0/c1/c2 columns to meaningful names.
extracted
  .toDF("id", "Zipcode", "ZipCodeType", "City")
  .show(false)
get_json_object() – Extracts a JSON element from a JSON string based on the specified JSON path
import org.apache.spark.sql.functions.{get_json_object}
// get_json_object() extracts a single element addressed by a JSON path ("$.field").
// NOTE(review): relies on df("value") holding a JSON string — verify upstream.
val zipType = get_json_object(col("value"), "$.ZipCodeType").as("ZipCodeType")
df.select(col("id"), zipType).show(false)
//import org.apache.spark.sql.functions._
// Sort ascending by department, then state; passing Column objects
// (col("department"), col("state")) would behave the same.
println("DataFrame sorting using orderBy() function")
val sorted = df.orderBy("department", "state")
sorted.show(false)
map()
Syntax - map(cols: Column*): Column
// Convert the "properties" struct column into a single MapType column.
// NOTE(review): this fragment depends on definitions not shown here —
// `propSchema` (presumably the StructType of the "properties" field),
// a mutable `columns` buffer, and `df` declared as a var — confirm upstream.
val index = df.schema.fieldIndex("properties")
propSchema.fields.foreach(field =>{
// map() takes alternating key/value columns: the literal field name, then its value.
columns.add(lit(field.name))
columns.add(col("properties." + field.name))
})
// Build the map column from the flattened key/value sequence, then drop the struct.
df = df.withColumn("propertiesMap",map(columns.toSeq:_*))
df = df.drop("properties")
df.printSchema()
df.show(false)
//row_number
// row_number() assigns consecutive 1-based ranks within each department,
// ordered by salary (no gaps, ties broken arbitrarily).
val byDeptSalary = Window.partitionBy("department").orderBy("salary")
df.withColumn("row_number", row_number.over(byDeptSalary)).show()