2. RDDs in Spark

from pyspark import SparkContext, SparkConf

# Map example setup: configure the application name and start the
# SparkContext that the rest of this example uses as `sc`.

conf = SparkConf().setAppName("MapFunctionExample")
sc = SparkContext(conf=conf)
# Create an RDD by distributing a small local list of integers
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)

# Define a function to be applied using map

def square(x):
    """Return *x* multiplied by itself.

    Intended as the per-element function passed to ``rdd.map``.
    """
    return x * x
# Use the map transformation to apply the function to each element.
# This only describes the computation; the work happens when an action
# such as collect() is called.
squared_rdd = rdd.map(square)
# Collect the results into a list on the driver and print
squared_list = squared_rdd.collect()
print("Squared elements:", squared_list)
# Stop the SparkContext; only one may be active at a time, and the next
# example creates its own.
sc.stop()
from pyspark import SparkContext, SparkConf
# Filter example setup: configure the application name and start the
# SparkContext that the rest of this example uses as `sc`.
conf = SparkConf().setAppName("FilterFunctionExample")
sc = SparkContext(conf=conf)
# Create an RDD by distributing the integers 1 through 10
data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
rdd = sc.parallelize(data)
# Define a function to filter even numbers
def is_even(x):
    """Return True when *x* is divisible by 2, otherwise False.

    Intended as the predicate passed to ``rdd.filter``.
    """
    return x % 2 == 0

# Use the filter transformation to keep only the elements for which
# is_even returns True

even_rdd = rdd.filter(is_even)

# Collect the results into a list on the driver and print

even_list = even_rdd.collect()
print("Even elements:", even_list)
# Stop the SparkContext; only one may be active at a time, and the next
# example creates its own.
sc.stop()
from pyspark import SparkContext, SparkConf

# FlatMap example setup: configure the application name and start the
# SparkContext that the rest of this example uses as `sc`.

conf = SparkConf().setAppName("FlatMapFunctionExample")
sc = SparkContext(conf=conf)

# Create an RDD of words by distributing a small local list of strings

data = ["Hello", "Spark", "RDD", "FlatMap"]
rdd = sc.parallelize(data)
# Define a function to split a word into characters
def split_characters(word):
    """Return the characters of *word* as a list, in order.

    Intended as the function passed to ``rdd.flatMap``, which flattens the
    returned lists into a single RDD of characters.
    """
    return list(word)
# Use the flatMap transformation to split each word into characters;
# the per-word lists are flattened into one RDD of single characters.
char_rdd = rdd.flatMap(split_characters)
# Collect the results into a list on the driver and print
char_list = char_rdd.collect()
print("Characters:", char_list)
# The SparkContext is left running here because the mapPartitions
# snippet that follows reuses `sc`.

def sum_partition(iterator):
    """Yield a single value: the sum of all items produced by *iterator*.

    Written as a generator so it can be passed to ``rdd.mapPartitions``,
    which expects a function that takes an iterator over one partition and
    returns an iterable of results. An empty iterator yields 0.
    """
    yield sum(iterator)

# Create an RDD with 3 partitions (the second argument to parallelize)
rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 3)

# Apply sum_partition once per partition, producing one sum per partition
sums_rdd = rdd.mapPartitions(sum_partition)

# Collect the per-partition sums into a list on the driver and print
sums_list = sums_rdd.collect()
print("Partition sums:", sums_list)

# Stop the SparkContext; only one may be active at a time, and the next
# example creates its own.
sc.stop()
from pyspark import SparkContext, SparkConf

# reduceByKey example: sum the values that share the same key.

# Create a SparkConf and SparkContext
conf = SparkConf().setAppName("ReduceByKeyExample")
sc = SparkContext(conf=conf)

# Create an RDD of (key, value) pairs
data = [("apple", 5), ("banana", 3), ("apple", 2), ("banana", 7), ("orange", 4)]
rdd = sc.parallelize(data)

# Use the reduceByKey transformation to aggregate values for each key.
# In this case, we'll use addition as the reduce function.
sums_rdd = rdd.reduceByKey(lambda x, y: x + y)

# Collect the results into a list on the driver and print
sums_list = sums_rdd.collect()
print("Sums by key:", sums_list)

# Stop the SparkContext; only one may be active at a time, and the next
# example creates its own.
sc.stop()
from pyspark import SparkContext, SparkConf
# union example: concatenate two RDDs into one.

# Create a SparkConf and SparkContext
conf = SparkConf().setAppName("UnionFunctionExample")
sc = SparkContext(conf=conf)

# Create two RDDs
rdd1 = sc.parallelize([1, 2, 3, 4, 5])
rdd2 = sc.parallelize([4, 5, 6, 7, 8])

# Use the union transformation to combine the two RDDs.
# union keeps duplicates; apply distinct() afterwards if they are unwanted.
combined_rdd = rdd1.union(rdd2)

# Collect the results into a list on the driver and print
combined_list = combined_rdd.collect()
print("Combined elements:", combined_list)

# Stop the SparkContext; only one may be active at a time, and the next
# example creates its own.
sc.stop()

from pyspark import SparkContext, SparkConf

# first() example: fetch the first element of an RDD.

# Create a SparkConf and SparkContext
conf = SparkConf().setAppName("FirstActionExample")
sc = SparkContext(conf=conf)

# Create an RDD
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)

# Get the first element of the RDD
first_element = rdd.first()

print("First element:", first_element)

# Stop the SparkContext; only one may be active at a time, and the next
# example creates its own.
sc.stop()

from pyspark import SparkContext, SparkConf

# count() example: count the elements of an RDD.

# Create a SparkConf and SparkContext
conf = SparkConf().setAppName("CountActionExample")
sc = SparkContext(conf=conf)

# Create an RDD
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)

# Get the count of elements in the RDD
element_count = rdd.count()

print("Element count:", element_count)

# Stop the SparkContext; only one may be active at a time, and the next
# example creates its own.
sc.stop()

from pyspark import SparkContext, SparkConf

# collect() example: bring every element of an RDD back to the driver.

# Create a SparkConf and SparkContext
conf = SparkConf().setAppName("CollectActionExample")
sc = SparkContext(conf=conf)

# Create an RDD
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)

# Use the collect action to retrieve all elements as a Python list
all_elements = rdd.collect()

print("All elements:", all_elements)

# Stop the SparkContext; only one may be active at a time, and the next
# example creates its own.
sc.stop()

from pyspark import SparkContext, SparkConf

# take() example: fetch only the first few elements of an RDD.

# Create a SparkConf and SparkContext
conf = SparkConf().setAppName("TakeActionExample")
sc = SparkContext(conf=conf)

# Create an RDD
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)

# Use the take action to retrieve a specified number of elements (here 3)
selected_elements = rdd.take(3)

print("Selected elements:", selected_elements)

# Stop the SparkContext; only one may be active at a time, and the next
# example creates its own.
sc.stop()

Another RDD example program
import findspark
from pyspark import SparkContext, SparkConf
# Word-frequency example: count how often each word occurs in a text file.

# Create a Spark context
conf = SparkConf().setAppName("WordFrequency")
sc = SparkContext(conf=conf)

# Read the text file into an RDD of lines
text_file_path = "data.txt"
text_rdd = sc.textFile(text_file_path)

# Tokenize the text into words. split() with no argument splits on any run
# of whitespace, so repeated spaces do not produce empty-string "words".
words_rdd = text_rdd.flatMap(lambda line: line.split())

# Count the frequency of each unique word. countByValue() is an action that
# returns a dict to the driver, so despite its name this is not an RDD.
word_counts_rdd = words_rdd.countByValue()

# Display the word frequencies
for word, count in word_counts_rdd.items():
    print(f"{word}: {count}")

# Stop the Spark context
sc.stop()

from pyspark import SparkContext

# Word-sort example: sort a small list of words with RDD.sortBy.
sc = SparkContext("local", "WordSort")
words = ["apple", "banana", "cherry", "date", "elderberry", "fig"]

# Create an RDD from the list of words
wordRDD = sc.parallelize(words)

# Sort the RDD and collect the sorted list of words.
# NOTE(review): the original statement was cut off after the key function;
# ascending order plus collect() is reconstructed from the comment above.
sortedWords = wordRDD.sortBy(lambda word: word, ascending=True).collect()
print("Sorted words:", sortedWords)

# Stop the SparkContext; only one may be active at a time, and the next
# example creates its own.
sc.stop()
Set operations on RDDs

reduce() on RDDs

from pyspark import SparkConf, SparkContext

# reduce() example: fold all elements of an RDD into a single value.

# Create a Spark configuration; "local[*]" runs Spark locally using all
# available cores
conf = SparkConf().setAppName("RDDReduceExample").setMaster("local[*]")

# Create a Spark context
sc = SparkContext(conf=conf)

# Sample data: an RDD of integers
numbers_rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

# Use reduce() to calculate the sum of elements. The function must be
# commutative and associative because partitions are reduced independently.
sum_result = numbers_rdd.reduce(lambda x, y: x + y)

print(f"Sum of elements: {sum_result}")

# Stop the Spark context
sc.stop()

1. In which language is Spark coded?
2. Which are the cluster managers supported by Spark?

3. What is Apache Spark?

a) A distributed storage system b) A distributed streaming platform
c) A fast and general-purpose cluster computing system d) A data visualization tool

4) Which of the following is a machine learning library provided by Spark?

a) TensorFlow b) scikit-learn c) MLlib d) Keras

5) What is the primary programming interface for using Spark in Python?

a) SparkSQL b) SparkR c) PySpark d) SparkPython

6. Which transformation is used to create a new RDD by applying a function to each element of the source RDD?
a) `map()`
b) `filter()`
c) `reduce()`
d) `flatMap()`
7. Which transformation is used to create a new RDD by applying a function that returns an iterator to each element of the
source RDD, and then flattening the results?
a) `map()`
b) `filter()`
c) `reduce()`
d) `flatMap()`
8. Which transformation is used to create a new RDD containing only the elements that satisfy a given condition?
a) `map()`
b) `filter()`
c) `reduce()`
d) `distinct()`

9. Which transformation is used to create a new RDD by combining elements from two RDDs with the same key?
a) `join()`
b) `union()`
c) `groupByKey()`
d) `reduceByKey()`

10. What is the difference between `map()` and `flatMap()` transformations in Spark?
a) `map()` applies a function to each element, while `flatMap()` applies a function returning an iterator to each element and flattens the results.
b) `map()` creates a new RDD with the same number of elements, while `flatMap()` creates an RDD with fewer elements.
c) `map()` can only be used with numerical data, while `flatMap()` works with any data type.
d) `map()` is used for parallel execution, while `flatMap()` is used for sequential execution.

11. Which transformation should be used to create a new RDD containing the distinct elements of the source RDD?
a) distinct()
b) unique()
c) removeDuplicates()
d) filter()

