2. RDDs in Spark


map()

from pyspark import SparkContext, SparkConf

# Create a SparkConf and SparkContext
conf = SparkConf().setAppName("MapFunctionExample")
sc = SparkContext(conf=conf)

# Create an RDD
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)

# Define a function to be applied using map
def square(x):
    return x * x

# Use the map transformation to apply the function to each element
squared_rdd = rdd.map(square)

# Collect the results into a list and print
squared_list = squared_rdd.collect()
print(squared_list)

# Stop the SparkContext
sc.stop()
filter()
from pyspark import SparkContext, SparkConf

# Create a SparkConf and SparkContext
conf = SparkConf().setAppName("FilterFunctionExample")
sc = SparkContext(conf=conf)

# Create an RDD
data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
rdd = sc.parallelize(data)

# Define a function to filter even numbers
def is_even(x):
    return x % 2 == 0

# Use the filter transformation to select even numbers
even_rdd = rdd.filter(is_even)

# Collect the results into a list and print
even_list = even_rdd.collect()
print(even_list)

# Stop the SparkContext
sc.stop()
flatMap()
from pyspark import SparkContext, SparkConf

# Create a SparkConf and SparkContext
conf = SparkConf().setAppName("FlatMapFunctionExample")
sc = SparkContext(conf=conf)

# Create an RDD of words
data = ["Hello", "Spark", "RDD", "FlatMap"]
rdd = sc.parallelize(data)

# Define a function to split a word into characters
def split_characters(word):
    return list(word)

# Use the flatMap transformation to split each word into characters
char_rdd = rdd.flatMap(split_characters)

# Collect the results into a list and print
char_list = char_rdd.collect()
print(char_list)

# Stop the SparkContext
sc.stop()
mapPartitions()

from pyspark import SparkContext, SparkConf

# Create a SparkConf and SparkContext
conf = SparkConf().setAppName("MapPartitionsExample")
sc = SparkContext(conf=conf)

# Define a function that sums the elements of a single partition
def sum_partition(iterator):
    yield sum(iterator)

# Create an RDD with 3 partitions
rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 3)

# Use the mapPartitions transformation to sum each partition
sums_rdd = rdd.mapPartitions(sum_partition)

# Collect the results into a list and print
sums_list = sums_rdd.collect()
print(sums_list)

# Stop the SparkContext
sc.stop()
reduceByKey()
from pyspark import SparkContext, SparkConf

# Create a SparkConf and SparkContext
conf = SparkConf().setAppName("ReduceByKeyExample")
sc = SparkContext(conf=conf)

# Create an RDD of key-value pairs
data = [("apple", 5), ("banana", 3), ("apple", 2), ("banana", 7), ("orange", 4)]
rdd = sc.parallelize(data)

# Use the reduceByKey transformation to aggregate values for each key
# In this case, we'll use addition as the reduce function
sums_rdd = rdd.reduceByKey(lambda x, y: x + y)

# Collect the results into a list and print
sums_list = sums_rdd.collect()
print(sums_list)

# Stop the SparkContext
sc.stop()
union()
from pyspark import SparkContext, SparkConf

# Create a SparkConf and SparkContext
conf = SparkConf().setAppName("UnionFunctionExample")
sc = SparkContext(conf=conf)

# Create two RDDs
rdd1 = sc.parallelize([1, 2, 3, 4, 5])
rdd2 = sc.parallelize([4, 5, 6, 7, 8])

# Use the union transformation to combine the two RDDs
combined_rdd = rdd1.union(rdd2)

# Collect the results into a list and print
combined_list = combined_rdd.collect()
print(combined_list)

# Stop the SparkContext
sc.stop()
first()
from pyspark import SparkContext, SparkConf

# Create a SparkConf and SparkContext
conf = SparkConf().setAppName("FirstActionExample")
sc = SparkContext(conf=conf)

# Create an RDD
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)

# Get the first element of the RDD
first_element = rdd.first()
print("First element:", first_element)

# Stop the SparkContext
sc.stop()
count()
from pyspark import SparkContext, SparkConf

# Create a SparkConf and SparkContext
conf = SparkConf().setAppName("CountActionExample")
sc = SparkContext(conf=conf)

# Create an RDD
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)

# Get the count of elements in the RDD
element_count = rdd.count()
print("Element count:", element_count)

# Stop the SparkContext
sc.stop()
collect()
from pyspark import SparkContext, SparkConf

# Create a SparkConf and SparkContext
conf = SparkConf().setAppName("CollectActionExample")
sc = SparkContext(conf=conf)

# Create an RDD
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)

# Use the collect action to retrieve all elements
all_elements = rdd.collect()
print("All elements:", all_elements)

# Stop the SparkContext
sc.stop()
take()
from pyspark import SparkContext, SparkConf

# Create a SparkConf and SparkContext
conf = SparkConf().setAppName("TakeActionExample")
sc = SparkContext(conf=conf)

# Create an RDD
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)

# Use the take action to retrieve a specified number of elements
selected_elements = rdd.take(3)
print("Selected elements:", selected_elements)

# Stop the SparkContext
sc.stop()
Another RDD example program
WORD COUNT PROGRAM
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf

# Create a Spark context
conf = SparkConf().setAppName("WordFrequency")
sc = SparkContext(conf=conf)

# Read the text file
text_file_path = "data.txt"
text_rdd = sc.textFile(text_file_path)

# Tokenize the text into words
words_rdd = text_rdd.flatMap(lambda line: line.split(" "))

# Count the frequency of each unique word
# (countByValue() is an action: it returns a dict of (word, count) to the driver)
word_counts = words_rdd.countByValue()

# Display the word frequencies
for word, count in word_counts.items():
    print(f"{word}: {count}")

# Stop the Spark context
sc.stop()
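The same word frequencies can also be computed with the map() and reduceByKey() transformations shown earlier, keeping the counts in an RDD instead of a driver-side dict. A minimal sketch, assuming the same data.txt input (the app name here is illustrative):

from pyspark import SparkContext, SparkConf

# Create a Spark context
conf = SparkConf().setAppName("WordFrequencyReduceByKey")
sc = SparkContext(conf=conf)

# Read the file and split each line into words
words_rdd = sc.textFile("data.txt").flatMap(lambda line: line.split(" "))

# Pair each word with 1, then sum the 1s per word
word_counts_rdd = words_rdd.map(lambda word: (word, 1)).reduceByKey(lambda x, y: x + y)

# Bring the (word, count) pairs to the driver and print them
for word, count in word_counts_rdd.collect():
    print(f"{word}: {count}")

# Stop the Spark context
sc.stop()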
SORT WORDS

from pyspark import SparkContext

sc = SparkContext("local", "WordSort")
words = ["apple", "banana", "cherry", "date", "elderberry", "fig"]

# Create an RDD from the list of words
wordRDD = sc.parallelize(words)

# Sort the RDD and collect the sorted list of words
sortedWords = wordRDD.sortBy(lambda word: word, ascending=True).collect()
print(sortedWords)
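sortBy() also accepts an arbitrary key function. A small follow-on sketch, assuming the sc and wordRDD created above, that sorts the same words by length with the longest first:

# Sort by word length in descending order (the lambda returns the sort key)
sortedByLength = wordRDD.sortBy(lambda word: len(word), ascending=False).collect()
print(sortedByLength)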
Set operations on RDDs

rdd1 = sc.parallelize([1, 2, 3, 4])
rdd2 = sc.parallelize([3, 4, 5, 6, 6, 4])
rdd1.subtract(rdd2)      # elements of rdd1 not present in rdd2
rdd1.intersection(rdd2)  # elements common to both RDDs
rdd1.distinct()          # removes duplicate elements
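These calls only define new RDDs; an action such as collect() is needed to see the results. A short sketch, assuming the rdd1 and rdd2 defined above and a SparkContext named sc as in the earlier examples (element order in the output may vary):

print(rdd1.subtract(rdd2).collect())      # e.g. [1, 2]
print(rdd1.intersection(rdd2).collect())  # e.g. [3, 4]
print(rdd2.distinct().collect())          # e.g. [3, 4, 5, 6]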
reduce() on RDDs

from pyspark import SparkConf, SparkContext

# Create a Spark configuration
conf = SparkConf().setAppName("RDDReduceExample").setMaster("local[*]")

# Create a Spark context
sc = SparkContext(conf=conf)

# Sample data: an RDD of integers
numbers_rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

# Use reduce() to calculate the sum of elements
sum_result = numbers_rdd.reduce(lambda x, y: x + y)
print(f"Sum of elements: {sum_result}")

# Stop the Spark context
sc.stop()
1. In which language is Spark coded?
2. Which cluster managers are supported by Spark?

3. What is Apache Spark?
a) A distributed storage system
b) A distributed streaming platform
c) A fast and general-purpose cluster computing system
d) A data visualization tool

4. Which of the following is a machine learning library provided by Spark?
a) TensorFlow
b) scikit-learn
c) MLlib
d) Keras

5. What is the primary programming interface for using Spark in Python?
a) SparkSQL
b) SparkR
c) PySpark
d) SparkPython

6. Which transformation is used to create a new RDD by applying a function to each element of the source RDD?
a) `map()`
b) `filter()`
c) `reduce()`
d) `flatMap()`

7. Which transformation is used to create a new RDD by applying a function that returns an iterator to each element of the source RDD, and then flattening the results?
a) `map()`
b) `filter()`
c) `reduce()`
d) `flatMap()`

8. Which transformation is used to create a new RDD containing only the elements that satisfy a given condition?
a) `map()`
b) `filter()`
c) `reduce()`
d) `distinct()`

9. Which transformation is used to create a new RDD by combining elements from two RDDs with the same key?
a) `join()`
b) `union()`
c) `groupByKey()`
d) `reduceByKey()`

10. What is the difference between `map()` and `flatMap()` transformations in Spark?
a) `map()` applies a function to each element, while `flatMap()` applies a function returning an iterator to each element.
b) `map()` creates a new RDD with the same number of elements, while `flatMap()` creates an RDD with fewer elements.
c) `map()` can only be used with numerical data, while `flatMap()` works with any data type.
d) `map()` is used for parallel execution, while `flatMap()` is used for sequential execution.

11. Which transformation should be used to create a new RDD containing the distinct elements of the source RDD?
a) distinct()
b) unique()
c) removeDuplicates()
d) filter()
