
Section A

1)

# Load movies.dat as an RDD with 3 partitions, then split each line on "#"
movies = sc.textFile('file:///home/ubuntu/Desktop/movies.dat', 3)

movie = movies.map(lambda x: x.split("#"))

2)

movies.take(5)

3)

# Keep only movies whose genre field (index 2) is "Horror", then count them
tothor = movie.filter(lambda x: x[2] == "Horror")

tothor.count()

4)

users = sc.textFile('file:///home/ubuntu/Desktop/users.dat', 2)

user = users.map(lambda x: x.split("#"))

5)

# Female users (gender field, index 1) whose occupation (index 3) is "Educator"
FemaleEmp = user.filter(lambda x: x[1] == "Female").filter(lambda x: x[3] == "Educator")

6)
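The commands below imply the RDD was first written out to a local directory named female; a minimal save step consistent with the paths shown (assumed, since the original answer omits it):

FemaleEmp.saveAsTextFile('file:///home/ubuntu/female')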

cd female

cat part-00000

hdfs dfs -put /home/ubuntu/female /user/ubuntu/female

hdfs dfs -cat /user/ubuntu/female/part-00000


7)

FemaleEmp.toDebugString()
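In PySpark, toDebugString() returns the lineage description as bytes; decoding it makes the plan readable (a usage note, not part of the original answer):

print(FemaleEmp.toDebugString().decode())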

8)

ratings = sc.textFile("file:///home/ubuntu/Desktop/ratings.dat")

rating = ratings.map(lambda x: x.split("#"))

9)
Section B

1)

MovieDf = spark.createDataFrame(movie)

# Rename the default positional columns
MovieDF = MovieDf.toDF('MovieID', 'Titles', 'Genres')
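A quick check that the schema and rows look right (a verification step, not in the original answer):

MovieDF.printSchema()
MovieDF.show(5)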

2)

ratingDF = spark.read.options(inferSchema=True, sep="#", header=False).csv("file:///home/ubuntu/Desktop/ratings.dat")

# Equivalent alternative using the format/option/load style:
ratingsDF = spark.read.format("csv").option("header", False).option("delimiter", "#").load("file:///home/ubuntu/Desktop/ratings.dat")

RatingDF = ratingDF.toDF('UserID', 'MovieID', 'Rating', 'TStamp')

3)

# Register the DataFrames as temp views so they can be queried with SQL
RatingDF.createOrReplaceTempView("RatingView")

MovieDF.createOrReplaceTempView("MovieView")

4)

m1=spark.sql("select * from MovieView")

r1=spark.sql("select * from RatingView")
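spark.sql only defines the result DataFrames; to actually display the rows (a usage note, not in the original answer):

m1.show(5)
r1.show(5)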


5)
Section C

1)

ubuntu@ip-172-31-69-2:~$ cd bigdata/kafka

ubuntu@ip-172-31-69-2:~/bigdata/kafka$ kafka-topics.sh --create --topic Rating --bootstrap-server localhost:9092 --replication-factor 1 --partitions 2

Created topic Rating.

ubuntu@ip-172-31-69-2:~/bigdata/kafka$ kafka-topics.sh --describe --bootstrap-server localhost:9092 --topic Rating

Topic: Rating  TopicId: XDwz9oVuQiCPIRcXN-5cHw  PartitionCount: 2  ReplicationFactor: 1  Configs:
  Topic: Rating  Partition: 0  Leader: 0  Replicas: 0  Isr: 0
  Topic: Rating  Partition: 1  Leader: 0  Replicas: 0  Isr: 0
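To confirm that messages can be read back from the topic, a console consumer can be attached (a verification step, assumed rather than taken from the original transcript):

kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic Rating --from-beginning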

2)

ubuntu@ip-172-31-69-2:~/Desktop$ cat ratings.dat | nc -lk 9999
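Here nc listens on port 9999 and pipes ratings.dat to whatever connects, which the socket source started in step 3 below reads line by line.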

3)

from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col
from pyspark.sql.types import IntegerType


spark = SparkSession.builder.appName("RatingStructuredStreamingDF").getOrCreate()

socketDF = spark.readStream.format("socket").option("host", "localhost").option("port", 9999).load()

# Split each socket line on "#" into the four named columns
RatingDF = socketDF.select(
    split(socketDF.value, "#").getItem(0).alias("UserID"),
    split(socketDF.value, "#").getItem(1).alias("MovieID"),
    split(socketDF.value, "#").getItem(2).alias("Rating"),
    split(socketDF.value, "#").getItem(3).alias("Tstamp"))

RatingDFwithCol = (RatingDF
    .withColumn("UserID", col("UserID").cast(IntegerType()))
    .withColumn("MovieID", col("MovieID").cast(IntegerType()))
    .withColumn("Rating", col("Rating").cast(IntegerType()))
    .withColumn("Tstamp", col("Tstamp").cast(IntegerType())))

# Running count of rows per Rating value
RatingCount = RatingDFwithCol.groupBy("Rating").count()

Ratingcountquery = (RatingCount.writeStream
    .outputMode("complete")
    .option("checkpointLocation", "file:///home/ubuntu/ratingcount")
    .format("console")
    .start())

The desired output could not be obtained because Spark reported that the task size was very large.
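One possible mitigation, assuming the default 200 shuffle partitions are what makes each micro-batch heavy on a single machine (an assumption, not something verified in the original run), is to lower the shuffle parallelism before starting the query:

# Hypothetical tuning step: fewer shuffle partitions for a small, single-node stream
spark.conf.set("spark.sql.shuffle.partitions", "2")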
