Professional Documents
Culture Documents
PySpark Exercises — Sejal Pittala
# Q1: load movies.dat into an RDD with 3 partitions, then split each
# "#"-delimited line into its fields.
movies = sc.textFile('file:///home/ubuntu/Desktop/movies.dat', 3)
movie = movies.map(lambda line: line.split("#"))
2)
# Q2: preview the first five raw (un-split) lines of movies.dat.
movies.take(5)
3)
# Q3: keep only rows whose genre field (index 2) is exactly "Horror",
# then count how many such movies there are.
tothor = movie.filter(lambda fields: fields[2] == "Horror")
tothor.count()
4)
# Q4: load users.dat into an RDD with 2 partitions and split each
# "#"-delimited line into its fields.
users = sc.textFile('file:///home/ubuntu/Desktop/users.dat', 2)
user = users.map(lambda line: line.split("#"))
5)
6)
# Q7: inspect the saved "female" output directory on disk, then print the
# RDD's lineage with toDebugString().
# NOTE(review): FemaleEmp is never defined in this transcript (the answers
# to Q5/Q6 are missing) — presumably an RDD of female users saved to the
# "female" directory; confirm against the original exercise sheet.
cd female
cat part-00000
FemaleEmp.toDebugString()
8)
# Q8: load ratings.dat (default partitioning) and split each line on "#".
ratings = sc.textFile("file:///home/ubuntu/Desktop/ratings.dat")
rating = ratings.map(lambda line: line.split("#"))
9)
Section B
1)
# Section B, Q1: build a DataFrame from the split movie RDD with named
# columns. Supplying the schema (column names) directly to
# createDataFrame replaces the two-step createDataFrame + toDF rename.
# Both MovieDf and MovieDF are kept because the original transcript
# defined both (they differed only in capitalisation) and later commands
# reference MovieDF.
MovieDf = spark.createDataFrame(movie, ['MovieID', 'Titles', 'Genres'])
MovieDF = MovieDf
# Section B, Q2: two equivalent ways to read ratings.dat as a DataFrame.
# (The file paths had been broken across lines by the document scrape,
# which made the original statements syntactically invalid; they are
# rejoined here.)
ratingDF = spark.read.options(inferSchema=True, sep="#", header=False) \
    .csv("file:///home/ubuntu/Desktop/ratings.dat")
# Alternative form using format/option/load:
ratingsDF = (spark.read.format("csv")
             .option("header", False)
             .option("delimiter", "#")
             .load("file:///home/ubuntu/Desktop/ratings.dat"))
# Give the inferred columns meaningful names.
RatingDF = ratingDF.toDF('UserID', 'MovieID', 'Rating', 'TStamp')
3)
# Section B, Q3: register both DataFrames as temporary views so they can
# be queried with spark.sql(...).
RatingDF.createOrReplaceTempView("RatingView")
MovieDF.createOrReplaceTempView("MovieView")
4)
1)
ubuntu@ip-172-31-69-2:~$ cd bigdata/kafka
2)
3)
# Structured Streaming: parse "#"-delimited lines arriving on the socket
# source, cast the fields to integers, count rows per rating value, and
# stream the running counts to the console.
# (The original statements had identifiers broken mid-token by the
# document scrape; they are rejoined here. The repeated
# split(socketDF.value, "#") is computed once, and the fifth unnamed,
# unused split column has been dropped from the select.)
parts = split(socketDF.value, "#")
RatingDF = socketDF.select(
    parts.getItem(0).alias("UserID"),
    parts.getItem(1).alias("MovieID"),
    parts.getItem(2).alias("Rating"),
    parts.getItem(3).alias("Tstamp"),
)
# Cast every column from string to int. NOTE(review): if Tstamp is an
# epoch timestamp, IntegerType overflows after 2038 — LongType would be
# safer; kept as-is to match the exercise.
RatingDFwithCol = (RatingDF
    .withColumn("UserID", col("UserID").cast(IntegerType()))
    .withColumn("MovieID", col("MovieID").cast(IntegerType()))
    .withColumn("Rating", col("Rating").cast(IntegerType()))
    .withColumn("Tstamp", col("Tstamp").cast(IntegerType())))
RatingCount = RatingDFwithCol.groupBy("Rating").count()
# "complete" output mode is required for a streaming aggregation; the
# checkpoint location makes the query recoverable.
Ratingcountquery = (RatingCount.writeStream
    .outputMode("complete")
    .option("checkpointLocation", "file:///home/ubuntu/ratingcount")
    .format("console")
    .start())
Note: the desired output could not be obtained because the serialized task size was too large.