2. RDDs in Spark


map()

from pyspark import SparkContext, SparkConf

# Create a SparkConf and SparkContext
conf = SparkConf().setAppName("MapFunctionExample")
sc = SparkContext(conf=conf)

# Create an RDD
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)

# Define a function to be applied using map
def square(x):
    return x * x

# Use the map transformation to apply the function to each element
squared_rdd = rdd.map(square)

# Collect the results into a list and print
squared_list = squared_rdd.collect()
print(squared_list)

# Stop the SparkContext
sc.stop()
filter()
from pyspark import SparkContext, SparkConf

# Create a SparkConf and SparkContext
conf = SparkConf().setAppName("FilterFunctionExample")
sc = SparkContext(conf=conf)

# Create an RDD
data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
rdd = sc.parallelize(data)

# Define a function to filter even numbers
def is_even(x):
    return x % 2 == 0

# Use the filter transformation to select even numbers
even_rdd = rdd.filter(is_even)

# Collect the results into a list and print
even_list = even_rdd.collect()
print(even_list)

# Stop the SparkContext
sc.stop()
flatMap()
from pyspark import SparkContext, SparkConf

# Create a SparkConf and SparkContext
conf = SparkConf().setAppName("FlatMapFunctionExample")
sc = SparkContext(conf=conf)

# Create an RDD of words
data = ["Hello", "Spark", "RDD", "FlatMap"]
rdd = sc.parallelize(data)

# Define a function to split a word into characters
def split_characters(word):
    return list(word)

# Use the flatMap transformation to split each word into characters
char_rdd = rdd.flatMap(split_characters)

# Collect the results into a list and print
char_list = char_rdd.collect()
print(char_list)

# Stop the SparkContext
sc.stop()
mapPartitions()

from pyspark import SparkContext, SparkConf

# Create a SparkConf and SparkContext
conf = SparkConf().setAppName("MapPartitionsExample")
sc = SparkContext(conf=conf)

# Define a function that sums the elements of a single partition
def sum_partition(iterator):
    yield sum(iterator)

# Create an RDD with 3 partitions
rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 3)

# Use the mapPartitions transformation to sum each partition
sums_rdd = rdd.mapPartitions(sum_partition)

# Collect the results into a list and print
sums_list = sums_rdd.collect()
print(sums_list)

# Stop the SparkContext
sc.stop()
reduceByKey()
from pyspark import SparkContext, SparkConf

# Create a SparkConf and SparkContext
conf = SparkConf().setAppName("ReduceByKeyExample")
sc = SparkContext(conf=conf)

# Create an RDD of key-value pairs
data = [("apple", 5), ("banana", 3), ("apple", 2), ("banana", 7), ("orange", 4)]
rdd = sc.parallelize(data)

# Use the reduceByKey transformation to aggregate values for each key
# In this case, we'll use addition as the reduce function
sums_rdd = rdd.reduceByKey(lambda x, y: x + y)

# Collect the results into a list and print
sums_list = sums_rdd.collect()
print(sums_list)

# Stop the SparkContext
sc.stop()
union()
from pyspark import SparkContext, SparkConf

# Create a SparkConf and SparkContext
conf = SparkConf().setAppName("UnionFunctionExample")
sc = SparkContext(conf=conf)

# Create two RDDs
rdd1 = sc.parallelize([1, 2, 3, 4, 5])
rdd2 = sc.parallelize([4, 5, 6, 7, 8])

# Use the union transformation to combine the two RDDs
combined_rdd = rdd1.union(rdd2)

# Collect the results into a list and print
combined_list = combined_rdd.collect()
print(combined_list)

# Stop the SparkContext
sc.stop()
first()
from pyspark import SparkContext, SparkConf

# Create a SparkConf and SparkContext
conf = SparkConf().setAppName("FirstActionExample")
sc = SparkContext(conf=conf)

# Create an RDD
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)

# Get the first element of the RDD
first_element = rdd.first()
print("First element:", first_element)

# Stop the SparkContext
sc.stop()
count()
from pyspark import SparkContext, SparkConf

# Create a SparkConf and SparkContext
conf = SparkConf().setAppName("CountActionExample")
sc = SparkContext(conf=conf)

# Create an RDD
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)

# Get the count of elements in the RDD
element_count = rdd.count()
print("Element count:", element_count)

# Stop the SparkContext
sc.stop()
collect()
from pyspark import SparkContext, SparkConf

# Create a SparkConf and SparkContext
conf = SparkConf().setAppName("CollectActionExample")
sc = SparkContext(conf=conf)

# Create an RDD
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)

# Use the collect action to retrieve all elements
all_elements = rdd.collect()
print("All elements:", all_elements)

# Stop the SparkContext
sc.stop()
take()
from pyspark import SparkContext, SparkConf

# Create a SparkConf and SparkContext
conf = SparkConf().setAppName("TakeActionExample")
sc = SparkContext(conf=conf)

# Create an RDD
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)

# Use the take action to retrieve a specified number of elements
selected_elements = rdd.take(3)
print("Selected elements:", selected_elements)

# Stop the SparkContext
sc.stop()
Another RDD example program
WORD COUNT PROGRAM
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf

# Create a Spark context
conf = SparkConf().setAppName("WordFrequency")
sc = SparkContext(conf=conf)

# Read the text file
text_file_path = "data.txt"
text_rdd = sc.textFile(text_file_path)

# Tokenize the text into words
words_rdd = text_rdd.flatMap(lambda line: line.split(" "))

# Count the frequency of each unique word
# (countByValue() is an action: it returns a dict of (word, count) to the driver)
word_counts = words_rdd.countByValue()

# Display the word frequencies
for word, count in word_counts.items():
    print(f"{word}: {count}")

# Stop the Spark context
sc.stop()
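The same word frequencies can also be computed with the map() and reduceByKey() transformations shown earlier, keeping the counts in an RDD instead of a driver-side dict. A minimal sketch, assuming the same data.txt input (the app name here is illustrative):

from pyspark import SparkContext, SparkConf

# Create a Spark context
conf = SparkConf().setAppName("WordFrequencyReduceByKey")
sc = SparkContext(conf=conf)

# Read the file and split each line into words
words_rdd = sc.textFile("data.txt").flatMap(lambda line: line.split(" "))

# Pair each word with 1, then sum the 1s per word
word_counts_rdd = words_rdd.map(lambda word: (word, 1)).reduceByKey(lambda x, y: x + y)

# Bring the (word, count) pairs to the driver and print them
for word, count in word_counts_rdd.collect():
    print(f"{word}: {count}")

# Stop the Spark context
sc.stop()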
SORT WORDS

from pyspark import SparkContext

sc = SparkContext("local", "WordSort")
words = ["apple", "banana", "cherry", "date", "elderberry", "fig"]

# Create an RDD from the list of words
wordRDD = sc.parallelize(words)

# Sort the RDD and collect the sorted list of words
sortedWords = wordRDD.sortBy(lambda word: word, ascending=True).collect()
print(sortedWords)
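sortBy() also accepts an arbitrary key function. A small follow-on sketch, assuming the sc and wordRDD created above, that sorts the same words by length with the longest first:

# Sort by word length in descending order (the lambda returns the sort key)
sortedByLength = wordRDD.sortBy(lambda word: len(word), ascending=False).collect()
print(sortedByLength)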
Set operations on RDDs

rdd1 = sc.parallelize([1, 2, 3, 4])
rdd2 = sc.parallelize([3, 4, 5, 6, 6, 4])
rdd1.subtract(rdd2)      # elements of rdd1 not present in rdd2
rdd1.intersection(rdd2)  # elements common to both RDDs
rdd1.distinct()          # removes duplicate elements
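These calls only define new RDDs; an action such as collect() is needed to see the results. A short sketch, assuming the rdd1 and rdd2 defined above and a SparkContext named sc as in the earlier examples (element order in the output may vary):

print(rdd1.subtract(rdd2).collect())      # e.g. [1, 2]
print(rdd1.intersection(rdd2).collect())  # e.g. [3, 4]
print(rdd2.distinct().collect())          # e.g. [3, 4, 5, 6]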
reduce() on RDDs

from pyspark import SparkConf, SparkContext

# Create a Spark configuration
conf = SparkConf().setAppName("RDDReduceExample").setMaster("local[*]")

# Create a Spark context
sc = SparkContext(conf=conf)

# Sample data: an RDD of integers
numbers_rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

# Use reduce() to calculate the sum of elements
sum_result = numbers_rdd.reduce(lambda x, y: x + y)
print(f"Sum of elements: {sum_result}")

# Stop the Spark context
sc.stop()
1. In which language is Spark coded?
2. Which cluster managers are supported by Spark?

3. What is Apache Spark?
a) A distributed storage system
b) A distributed streaming platform
c) A fast and general-purpose cluster computing system
d) A data visualization tool

4. Which of the following is a machine learning library provided by Spark?
a) TensorFlow
b) scikit-learn
c) MLlib
d) Keras

5. What is the primary programming interface for using Spark in Python?
a) SparkSQL
b) SparkR
c) PySpark
d) SparkPython

6. Which transformation is used to create a new RDD by applying a function to each element of the source RDD?
a) `map()`
b) `filter()`
c) `reduce()`
d) `flatMap()`

7. Which transformation is used to create a new RDD by applying a function that returns an iterator to each element of the source RDD, and then flattening the results?
a) `map()`
b) `filter()`
c) `reduce()`
d) `flatMap()`

8. Which transformation is used to create a new RDD containing only the elements that satisfy a given condition?
a) `map()`
b) `filter()`
c) `reduce()`
d) `distinct()`

9. Which transformation is used to create a new RDD by combining elements from two RDDs with the same key?
a) `join()`
b) `union()`
c) `groupByKey()`
d) `reduceByKey()`

10. What is the difference between `map()` and `flatMap()` transformations in Spark?
a) `map()` applies a function to each element, while `flatMap()` applies a function returning an iterator to each element.
b) `map()` creates a new RDD with the same number of elements, while `flatMap()` creates an RDD with fewer elements.
c) `map()` can only be used with numerical data, while `flatMap()` works with any data type.
d) `map()` is used for parallel execution, while `flatMap()` is used for sequential execution.

11. Which transformation should be used to create a new RDD containing the distinct elements of the source RDD?
a) distinct()
b) unique()
c) removeDuplicates()
d) filter()
