# Download as pdf or txt
# Download as pdf or txt
# You are on page 1 of 2

# Pre-flight check: print the installed Java version. Spark runs on a JVM, so
# warn right away when the command does not succeed instead of silently
# ignoring its exit status and (probably) failing later at connection time.
java_status <- system("java -version")
if (java_status != 0) {
  warning(
    "`java -version` exited with status ", java_status,
    "; Spark requires a working Java installation.",
    call. = FALSE
  )
}

# Load necessary libraries


library(sparklyr)
library(dplyr)
library(ggplot2)

# Install a local Spark distribution only when none is installed yet, so that
# re-running this script does not invoke the installer every time.
if (nrow(spark_installed_versions()) == 0) {
  spark_install()
}

# Connect to a local Spark instance; `sc` is the connection handle used by
# the rest of the script.
sc <- spark_connect(master = "local")

# Create a Spark DataFrame from the mtcars dataset, replacing any table of
# the same name left over from a previous run.
mtcars_spark <- copy_to(sc, mtcars, overwrite = TRUE)
mtcars_spark
# Summary statistics
# Compute the mean and maximum of mpg, hp and wt on the Spark table.
# `na.rm = TRUE` belongs on the aggregate functions themselves: passing it to
# print() has no effect on the computation. Stating it here makes the
# missing-value handling explicit for each aggregate.
summary_stats <- mtcars_spark %>%
  summarize(
    avg_mpg = mean(mpg, na.rm = TRUE),
    avg_hp = mean(hp, na.rm = TRUE),
    avg_wt = mean(wt, na.rm = TRUE),
    max_mpg = max(mpg, na.rm = TRUE),
    max_hp = max(hp, na.rm = TRUE),
    max_wt = max(wt, na.rm = TRUE)
  )

print(summary_stats)

# Scatter plot
# Pull the Spark table into a local data frame and plot mpg against hp.
ggplot(collect(mtcars_spark), aes(x = hp, y = mpg)) +
  geom_point() +
  labs(
    title = "Horsepower vs. MPG",
    x = "Horsepower",
    y = "MPG"
  )

# Linear regression model
# Fit mpg as the response against hp and wt on the Spark table.
linear_model <- ml_linear_regression(
  mtcars_spark,
  response = "mpg",
  features = c("hp", "wt")
)

# Model summary
summary(linear_model)

# Make predictions using the linear regression model: score the same Spark
# table the model was fitted on.
linear_predictions <- ml_predict(linear_model, dataset = mtcars_spark)

# Display linear regression predictions (first rows only)
head(linear_predictions)

#########################################
# K-means clustering model
# Number of clusters; used for the model fit and for the plot title below.
k <- 3
kmeans_model <- mtcars_spark %>%
  ml_kmeans(k = k, features = c("hp", "wt"))

# Make predictions (assign clusters).
# ml_predict() takes the fitted model first and the dataset second, the same
# argument order as the linear-regression prediction earlier in this script;
# piping the table in as the first argument would swap the two.
# Adding 1 because the predicted cluster indices start from 0.
clustered_data <- ml_predict(kmeans_model, mtcars_spark) %>%
  mutate(cluster = prediction + 1L)

# Display clustered data
head(clustered_data)

# Visualization of clusters
# The cluster label is converted to a factor only after collect(), i.e. on
# the local data frame: factors are an R-side type, so the conversion is kept
# out of the Spark-side mutate().
clustered_data %>%
  collect() %>%
  mutate(cluster = as.factor(cluster)) %>%
  ggplot(aes(x = hp, y = wt, color = cluster)) +
  geom_point() +
  ggtitle(paste0("K-Means Clustering (k=", k, ")")) +
  xlab("Horsepower") +
  ylab("Weight")

# Disconnect from the Spark cluster
# `sc` is the connection handle created by spark_connect() earlier in this
# script; nothing below this line can use Spark tables such as mtcars_spark.


spark_disconnect(sc)

# You might also like