DA Lab File



Q1 and Q2. Basic Operations, Variable Creation and Types of Operators

In [ ]:  # Addition
result_addition <- 10 + 5
print(result_addition) # Output: 15

# Subtraction
result_subtraction <- 20 - 8
print(result_subtraction) # Output: 12

# Multiplication
result_multiplication <- 6 * 4
print(result_multiplication) # Output: 24

# Division
result_division <- 15 / 3
print(result_division) # Output: 5

# Exponentiation
result_exponentiation <- 2^3
print(result_exponentiation) # Output: 8

In [ ]:  # Creating variables
a <- 10
b <- 5.5
text <- "Hello, World!"

# Printing variables
print(a)
print(b)
print(text)


In [ ]:  # Assignment
x <- 15
y <- x + 5
print(y) # Output: 20

# Comparison
p <- 10
q <- 20

# Greater than
print(p > q) # Output: FALSE

# Less than or equal to
print(p <= q) # Output: TRUE

# Equal to
print(p == q) # Output: FALSE

# Logical
r <- TRUE
s <- FALSE

# AND
print(r & s) # Output: FALSE

# OR
print(r | s) # Output: TRUE

# NOT
print(!r)

Q3. Creating various Data Structures


Creating Vector

In [ ]:  # Numeric vector
numeric_vector <- c(1, 2, 3, 4, 5)

# Character vector
char_vector <- c("apple", "banana", "orange")

# Logical vector
logical_vector <- c(TRUE, FALSE, TRUE)

Manipulating Vector


In [ ]:  # Accessing elements
print(numeric_vector[3]) # Output: 3

# Adding elements
numeric_vector <- c(numeric_vector, 6, 7)

# Vector operations
sum_result <- sum(numeric_vector)
mean_result <- mean(numeric_vector)

Creating Matrix

In [ ]:  # Accessing elements
print(numeric_vector[3]) # Output: 3

# Adding elements
numeric_vector <- c(numeric_vector, 6, 7)

# Vector operations
sum_result <- sum(numeric_vector)
mean_result <- mean(numeric_vector)
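A minimal sketch of creating and indexing a matrix with base R's matrix(); the dimensions and values below are illustrative.

# Creating a 2 x 3 numeric matrix (filled column-wise by default)
my_matrix <- matrix(1:6, nrow = 2, ncol = 3)
print(my_matrix)

# Accessing the element in row 1, column 2
print(my_matrix[1, 2])

# Matrix with named rows and columns
named_matrix <- matrix(c(10, 20, 30, 40), nrow = 2,
                       dimnames = list(c("r1", "r2"), c("c1", "c2")))
print(named_matrix)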

Dataframe

In [ ]:  # Creating a data frame


df <- data.frame(
name = c("John", "Alice", "Bob"),
age = c(25, 30, 28),
is_student = c(FALSE, TRUE, FALSE)
)

# Accessing elements
print(df$name) # Output: John, Alice, Bob

# Adding columns
df$city <- c("New York", "San Francisco", "Chicago")

Q4. Install and Load R packages


Installing

In [ ]:  install.packages("ggplot2")

Loading

In [ ]:  install.packages("ggplot2")

Checking Installed packages


In [ ]:  installed.packages()

Q5. Data Manipulation using dplyr


In [ ]:  # Load required packages
library(dplyr)

# Creating a sample data frame
data <- data.frame(
ID = c(1, 2, 3, 4, 5),
Name = c("Alice", "Bob", "Charlie", "David", "Eva"),
Age = c(25, 30, 22, 35, 28),
Department = c("HR", "Finance", "IT", "HR", "Marketing"),
Salary = c(50000, 60000, 55000, 52000, 58000)
)

In [ ]:  # Select specific columns


selected_data <- select(data, ID, Name, Age)

# Filter rows based on condition
filtered_data <- filter(data, Age > 25)

# Add a new column (e.g., calculating Bonus based on Salary)
mutated_data <- mutate(data, Bonus = Salary * 0.1)

# Group data by Department
grouped_data <- group_by(data, Department)

# Calculate mean salary per Department
summary_data <- summarise(grouped_data, Mean_Salary = mean(Salary))

# Arrange data by Age in ascending order
arranged_data <- arrange(data, Age)

# Select specific columns
selected_data <- select(data, ID, Name, Age)

# Chain multiple operations together
chained_data <- data %>%
filter(Age > 25) %>%
select(ID, Name, Age) %>%
arrange(desc(Age))


In [ ]:  print(selected_data)
print(filtered_data)
print(mutated_data)
print(grouped_data)
print(summary_data)
print(arranged_data)
print(chained_data)

Q6. Clean Data Using Tidying


In [ ]:  library(tidyr)

In [ ]:  # Creating a messy data frame


messy_data <- data.frame(
Name = c("Alice", "Bob", "Charlie"),
Math = c(85, 75, 90),
Science = c(NA, 80, 88),
English = c(78, 82, NA)
)

# Print the messy data
print(messy_data)


In [ ]:  # Tidying the data using gather
tidy_data <- gather(messy_data, Subject, Score, -Name, na.rm = TRUE)

# Print the tidy data
print(tidy_data)
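gather() still works but has been superseded in current tidyr; an equivalent using pivot_longer() (assuming the same messy_data from above) is sketched below.

# Equivalent tidying with pivot_longer(), the modern replacement for gather()
tidy_data2 <- pivot_longer(messy_data, cols = -Name,
                           names_to = "Subject", values_to = "Score",
                           values_drop_na = TRUE)
print(tidy_data2)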

Q7. Create different types of plots using ggplot2

In [ ]:  library(ggplot2)


In [ ]:  library(ggplot2)

# Create a scatter plot
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() +
labs(title = "Scatter Plot", x = "Weight", y = "Miles Per Gallon")

In [ ]:  # Create a line plot


ggplot(iris, aes(x = Sepal.Length, y = Petal.Length, color = Species)) +
geom_line() +
labs(title = "Line Plot", x = "Sepal Length", y = "Petal Length")

In [ ]:  # Create a bar plot


ggplot(iris, aes(x = Species, y = Sepal.Width, fill = Species)) +
geom_bar(stat = "summary", fun = "mean") +
labs(title = "Bar Plot", x = "Species", y = "Mean Sepal Width")

In [ ]:  # Create a boxplot
ggplot(iris, aes(x = Species, y = Petal.Width, fill = Species)) +
geom_boxplot() +
labs(title = "Boxplot", x = "Species", y = "Petal Width")

In [ ]:  # Create a histogram
ggplot(mtcars, aes(x = mpg)) +
geom_histogram(binwidth = 2, fill = "blue", color = "black") +
labs(title = "Histogram of MPG", x = "Miles Per Gallon", y = "Frequency")

In [ ]:  # Create a density plot


ggplot(iris, aes(x = Sepal.Length, fill = Species, color = Species)) +
geom_density(alpha = 0.6) +
labs(title = "Density Plot", x = "Sepal Length", y = "Density")

In [ ]:  # Create a violin plot


ggplot(iris, aes(x = Species, y = Petal.Length, fill = Species)) +
geom_violin() +
labs(title = "Violin Plot", x = "Species", y = "Petal Length")

Q8. Perform descriptive statistics and hypothesis testing

Descriptive Statistics


In [ ]:  # Summary statistics for a numeric variable


summary(iris$Sepal.Length)

In [ ]:  # Mean
mean(iris$Sepal.Length)

# Median
median(iris$Sepal.Length)

# Variance
var(iris$Sepal.Length)

# Standard Deviation
sd(iris$Sepal.Length)

In [ ]:  # Correlation matrix
cor(iris[, 1:4])

Hypothesis Testing

In [ ]:  # One-sample t-test
t.test(iris$Sepal.Length, mu = 5.8) # Testing if the mean is significantly

# Two-sample t-test
t.test(iris$Sepal.Length ~ iris$Species) # Testing if Sepal Length differs

In [ ]:  # Chi-square test for independence


chisq.test(table(iris$Species, iris$Petal.Length > 4)) # Testing the independence of Species and whether Petal.Length exceeds 4

In [ ]:  # One-way ANOVA
anova_model <- aov(Sepal.Length ~ Species, data = iris)
summary(anova_model)

Q9 & Q10. Import and Export Data from a CSV File, and Advanced Data Manipulation

In [ ]:  # Using readr package to read data from a CSV file
library(readr)

# Read data from a CSV file
data_from_csv <- read_csv("/kaggle/input/titanic/test.csv")


In [ ]:  # Count missing values in each column of the dataset


missing_count_per_column <- colSums(is.na(data_from_csv))
missing_count_per_column

Removing Missing Values

In [ ]:  # Removing rows with any NA values


data_without_na <- na.omit(data_from_csv)

# Removing columns with any NA values
data_without_na_columns <- data_from_csv[, colSums(is.na(data_from_csv)) == 0]

In [ ]:  # Imputing missing values with mean


data_from_csv$Age[is.na(data_from_csv$Age)] <- mean(data_from_csv$Age, na.rm = TRUE)

# data_from_csv$Cabin[is.na(data_from_csv$Cabin)] <- mean(data_from_csv$Cabin, na.rm = TRUE) # left commented out: Cabin is not numeric

In [ ]:  # Converting data from wide to long format


long_data <- pivot_longer(data_from_csv, cols = -c(Pclass, Name, Sex, Parch),
names_to = "New_Column_Name", values_to = "Value",
values_transform = list(Value = as.character)) # coerce mixed column types so they can share one value column

# Converting data from long to wide format
wide_data <- pivot_wider(long_data, names_from = "New_Column_Name", values_from = "Value")

In [ ]:  head(long_data)

In [ ]:  head(wide_data)

In [ ]:  # Filtering based on multiple conditions


filtered_data <- data_from_csv %>%
filter(Age > 50 & Sex == "male")

Using dplyr for complex operations

In [ ]:  # Example chaining operations to filter, mutate, and arrange data


processed_data <- data_from_csv %>%
filter(Age > 50) %>%
mutate(New_Column = Fare * 2) %>%
arrange(desc(New_Column))

Q11. Create a report with R Markdown.


1. Install and Load Required Packages: You need to have the rmarkdown package installed and loaded to create R Markdown documents.
#Install and load the rmarkdown package
install.packages("rmarkdown")
library(rmarkdown)


2. Create a New R Markdown Document: In RStudio, you can create a new R Markdown
document by going to File > New File > R Markdown.... This will open a dialog where you can
configure your R Markdown document.

3. Author Your R Markdown Document: In the R Markdown document, you can include a mix
of text, code chunks, and Markdown formatting. Here's an example of a simple R Markdown
document:

title: "Sample R Markdown Report" output: html_document: toc: true

#Introduction This is a sample R Markdown report. We'll include some code and plots in this
report.

#Data Loading

```{r} #Load data data <- read.csv("data.csv") head(data)

Data Summary #Summary statistics summary(data)

Data Visualization #Create a scatter plot library(ggplot2) ggplot(data, aes(x = X, y = Y)) +


geom_point() Conclusion In this example:

The title field in the YAML header sets the title of your report.
Under the "Data Loading" section, there's an R code chunk that loads data from a CSV file.
In the "Data Summary" section, another R code chunk provides summary statistics of the
data.
The "Data Visualization" section contains R code for creating a scatter plot using the
ggplot2 package.
The report includes text sections along with code chunks that can be executed to generate
results and visualizations.

4. Knit Your R Markdown Document: To generate the report, click the "Knit" button in RStudio,
or use the knit() function in R with your R Markdown file as an argument. This will run the
code chunks and produce an HTML (or other format) report.
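Outside RStudio, the same step can be run from the R console; a minimal sketch, where "report.Rmd" is an assumed placeholder file name:

# Render an R Markdown file to HTML from the R console
library(rmarkdown)
render("report.Rmd", output_format = "html_document")  # "report.Rmd" is a placeholder file name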

5. View and Share Your Report: Once the knitting process is complete, you can view the
generated report. It will include your text, code results, and plots, making it easy to communicate
your data analysis in a comprehensive document. You can save the HTML or other output
formats and share them as needed. R Markdown is a versatile tool for creating dynamic and
reproducible reports, and you can customize your documents to include various elements like
tables, LaTeX equations, citations, and more. For advanced formatting and customization, refer
to the R Markdown documentation and cheat sheets.

Q12. Understand the concept of machine learning and its types.


Machine learning is a subfield of artificial intelligence (AI) that focuses on the development of
algorithms and statistical models that enable computer systems to learn from and make
predictions or decisions based on data. The primary goal of machine learning is to develop
models that can identify patterns, extract insights, and make predictions or decisions without
being explicitly programmed. Here's an overview of the key concepts and types of machine
learning:

Key Concepts in Machine Learning:

Data: Machine learning relies on data, which serves as the primary source of information for
training, testing, and validating models. Data can come in various forms, including structured
(e.g., tables), unstructured (e.g., text or images), and semi-structured (e.g., JSON or XML).

Features: Features are characteristics or attributes extracted from the data that the model uses
for learning. Features play a crucial role in model performance.

Labels: In supervised learning, models are trained with labeled data, where the correct output
or category is known. The model learns to map inputs (features) to corresponding outputs
(labels).

Training: The training phase involves feeding the model with a dataset and adjusting its internal
parameters to minimize the difference between its predictions and the actual labels.

Testing and Validation: After training, the model is tested or validated using a separate dataset
to evaluate its performance and generalization to new, unseen data.

Predictions and Decisions: Once trained, a machine learning model can make predictions on
new, unlabeled data or make decisions based on the learned patterns.

Types of Machine Learning:

Machine learning can be broadly categorized into three main types, based on the learning
approach and the availability of labeled data:

Supervised Learning: In supervised learning, models are trained on labeled data, where both
input features and their corresponding output labels are known. The goal is to learn a mapping
function from input to output, enabling the model to predict labels for new, unseen data.
Common algorithms include linear regression, logistic regression, decision trees, and neural
networks.

Unsupervised Learning: Unsupervised learning is used when the data lacks labeled output,
and the goal is to discover patterns, structures, or groupings within the data. Clustering and
dimensionality reduction are common tasks in unsupervised learning. Algorithms include k-
means clustering, hierarchical clustering, and principal component analysis (PCA).

Reinforcement Learning: Reinforcement learning is focused on training agents to make
sequences of decisions to maximize cumulative rewards in an environment. Agents learn
through trial and error, receiving feedback in the form of rewards or penalties. It's commonly
used in applications like autonomous robotics and game playing. Popular reinforcement learning
algorithms include Q-learning and deep reinforcement learning using neural networks.
Additionally, there are other subtypes and approaches within machine learning, including semi-
supervised learning (combining labeled and unlabeled data), self-supervised learning (learning
from data without explicit labels), and transfer learning (applying knowledge from one task to
another). Machine learning plays a critical role in various applications, including image and
speech recognition, natural language processing, recommendation systems, autonomous
vehicles, and many others. It has become an essential tool for extracting knowledge and making
predictions from vast and complex datasets.

Q13. Apply supervised learning algorithms: Linear and Logistic Regression

1. Linear Regression:

In [1]:  data <- data.frame(


X = c(34,44,22,1,23,43),
Y = c(2, 4, 5, 4, 5)
)

In [2]:  model <- lm(Y ~ X, data = data)


In [4]:  new_data <- data.frame(X = c(23,43,22))


predictions <- predict(model, newdata = new_data)

In [8]:  plot(data$X, data$Y, main = "Linear Regression", xlab = "X", ylab = "Y")
abline(model, col = "red")


2. Logistic Regression:

In [12]:  # Create a binary classification dataset


data <- data.frame(
Exam_Score = c(78, 92, 60, 84, 74, 55, 70, 68, 90, 85),
Passed = c(1, 1, 0, 1, 1, 0, 1, 0, 1, 1)
)

In [17]:  # Train a logistic regression model


model <- glm(Passed ~ Exam_Score, data = data, family = binomial(link = "logit"))

Warning message:
“glm.fit: algorithm did not converge”
Warning message:
“glm.fit: fitted probabilities numerically 0 or 1 occurred”

In [15]:  # Make predictions


new_data <- data.frame(Exam_Score = c(65, 95, 75))
predictions <- predict(model, newdata = new_data, type = "response")

In [16]:  # Visualize the data and the logistic regression curve


plot(data$Exam_Score, data$Passed, main = "Logistic Regression", xlab = "Exam Score",
ylab = "Probability")
ord <- order(data$Exam_Score) # order by score so the fitted curve is drawn left to right
lines(data$Exam_Score[ord], predict(model, type = "response")[ord], col = "red")


Q14. Apply supervised learning algorithms: Decision Trees and Random Forest

1. Decision Trees:

Decision Trees are used for classification and regression tasks. They create a tree-like model of
decisions and their possible consequences. In R, you can use the rpart package to build
decision trees.

Classification Example:


In [19]:  # Load the rpart package


library(rpart)
# Create a classification dataset
data <- data.frame(
Age = c(25, 30, 35, 40, 45, 50, 55, 60),
Outcome = c("No", "No", "No", "No", "Yes", "Yes", "Yes", "Yes")
)
# Train a decision tree model for classification
model <- rpart(Outcome ~ Age, data = data, method = "class")
# Make predictions
new_data <- data.frame(Age = c(38, 52))
predictions <- predict(model, new_data, type = "class")
# Visualize the decision tree
library(rpart.plot)
prp(model)

Regression Example:


In [20]:  # Create a regression dataset


data <- data.frame(
X = c(1, 2, 3, 4, 5),
Y = c(2, 4, 5, 4, 5)
)
# Train a decision tree model for regression
model <- rpart(Y ~ X, data = data)
# Make predictions
new_data <- data.frame(X = c(6, 7, 8))
predictions <- predict(model, new_data)
# Visualize the decision tree
prp(model)

2. Random Forest:

Random Forest is an ensemble learning method that combines multiple decision trees to
improve accuracy and reduce overfitting. In R, you can use the randomForest package.

Classification Example:


In [23]:  # Load the randomForest package


library(randomForest)
# Create a classification dataset
data <- data.frame(
Feature1 = c(1, 2, 3, 4, 5),
Feature2 = c(2, 4, 5, 4, 5),
Class = c("A", "B", "A", "B", "A")
)
# Train a Random Forest model for classification
model <- randomForest(Class ~ Feature1 + Feature2, data = data)
# Make predictions
new_data <- data.frame(Feature1 = c(6, 7), Feature2 = c(5, 4))
predictions <- predict(model, new_data)
# Evaluate model performance
confusion_matrix <- table(predictions, data$Class)
print(confusion_matrix)

Warning message in randomForest.default(m, y, ...):


“The response has five or fewer unique values. Are you sure you want to
do regression?”
Warning message in mean.default(y):
“argument is not numeric or logical: returning NA”

Error in y - ymean: non-numeric argument to binary operator


Traceback:

1. randomForest(Class ~ Feature1 + Feature2, data = data)


2. randomForest.formula(Class ~ Feature1 + Feature2, data = data)
3. randomForest.default(m, y, ...)

Regression Example:


In [24]:  # Create a regression dataset


data <- data.frame(
X = c(1, 2, 3, 4, 5),
Y = c(2, 4, 5, 4, 5)
)
# Train a Random Forest model for regression
model <- randomForest(Y ~ X, data = data)
# Make predictions
new_data <- data.frame(X = c(6, 7, 8))
predictions <- predict(model, new_data)
# Visualize variable importance
importance(model)

Warning message in randomForest.default(m, y, ...):


“The response has five or fewer unique values. Are you sure you want to
do regression?”

A matrix: 1 × 1 of
type dbl

IncNodePurity

X 3.989933

Q15. Apply unsupervised learning algorithms: K-Means Clustering and Hierarchical Clustering

1. K-Means Clustering:

K-Means is a partitioning method that divides a dataset into K clusters based on similarity. It
aims to minimize the sum of squared distances within each cluster.


In [17]:  import numpy as np


import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Create synthetic data using make_blobs
X, _ = make_blobs(n_samples=300, centers=4, random_state=42)

# Visualize the data
plt.scatter(X[:, 0], X[:, 1], s=50, cmap='viridis')
plt.title('Synthetic Data')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

# Create a KMeans instance with the desired number of clusters (K)
kmeans = KMeans(n_clusters=4, random_state=42)

# Fit the KMeans model to the data
kmeans.fit(X)

# Get cluster centers and labels
centers = kmeans.cluster_centers_
labels = kmeans.labels_

# Visualize the clusters and centroids
plt.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='viridis')
plt.scatter(centers[:, 0], centers[:, 1], c='red', s=200, alpha=0.75, marker='X')
plt.title('K-Means Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

/tmp/ipykernel_47/1198998940.py:10: UserWarning: No data for colormapping


provided via 'c'. Parameters 'cmap' will be ignored
plt.scatter(X[:, 0], X[:, 1], s=50, cmap='viridis')

/opt/conda/lib/python3.10/site-packages/sklearn/cluster/_kmeans.py:870: F
utureWarning: The default value of `n_init` will change from 10 to 'auto'
in 1.4. Set the value of `n_init` explicitly to suppress the warning
warnings.warn(


2. Hierarchical Clustering:

Hierarchical Clustering creates a tree-like structure (dendrogram) that represents the relationships between data points. You can then cut the dendrogram at a certain level to form clusters.


In [18]:  # Import necessary libraries


import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

# Create synthetic data using make_blobs
X, _ = make_blobs(n_samples=300, centers=4, random_state=42)

# Visualize the data
plt.scatter(X[:, 0], X[:, 1], s=50, cmap='viridis')
plt.title('Synthetic Data')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

# Perform hierarchical clustering
# Using complete linkage as an example; you can choose other linkage methods
# (see the SciPy documentation for scipy.cluster.hierarchy.linkage)
linkage_matrix = linkage(X, method='complete')

# Create a dendrogram
dendrogram(linkage_matrix)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Data Points')
plt.ylabel('Distance')
plt.show()

# Cut the dendrogram to get clusters
# The 't' parameter is the threshold for cutting the dendrogram
clusters = fcluster(linkage_matrix, t=4, criterion='maxclust')

# Visualize the clusters
plt.scatter(X[:, 0], X[:, 1], c=clusters, s=50, cmap='viridis')
plt.title('Hierarchical Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

/tmp/ipykernel_47/1393158458.py:11: UserWarning: No data for colormapping


provided via 'c'. Parameters 'cmap' will be ignored
plt.scatter(X[:, 0], X[:, 1], s=50, cmap='viridis')


K-Means and Hierarchical Clustering are powerful techniques for discovering natural groupings
within data. The choice between them depends on the nature of the data and the desired
number of clusters. Experimenting with different clustering techniques and evaluating their
results is common practice in unsupervised learning.

Q16. Perform Principal Component Analysis.

Principal Component Analysis (PCA) is a dimensionality reduction technique used to reduce the complexity of high-dimensional data while preserving important information. It does this by transforming the original variables into a new set of uncorrelated variables called principal components. Here's how to perform PCA in R:

1. Load Data: Load your dataset into R. For this example, let's assume you have a dataset
named my_data with features in columns.

2. Standardize the Data: PCA is sensitive to the scale of the data, so it's a good practice to
standardize it to have zero mean and unit variance. You can use the scale() function for this.
#Standardize the data scaled_data <- scale(my_data)

3. Perform PCA: Use the prcomp() function to perform PCA on the standardized data. You can
specify the number of principal components you want to keep. #Perform PCA and keep all
principal components pca_result <- prcomp(scaled_data) #To specify the number of
components to keep, you can use: #pca_result <- prcomp(scaled_data, retx = TRUE, rank. = k)

4. Explore Results: You can access various attributes of the PCA result to explore the analysis, including:
pca_result$center: The means of the variables.
pca_result$scale: The standard deviations of the variables.
pca_result$sdev: The standard deviations of the principal components.
pca_result$rotation: The loadings (coefficients) of the variables on the principal components.
pca_result$x: The transformed data in the principal component space.

5. Visualize the Results: Visualize the explained variance by each principal component. You
can create a scree plot to understand how many components are needed to capture most of the
variance. #Create a scree plot screeplot(pca_result)

6. Interpret the Principal Components: You can interpret the principal components based on
the loadings of the original variables on each component. Positive or negative loadings indicate
the direction and strength of the variables' influence on the principal components.

7. Choose the Number of Components: Based on the scree plot and the amount of variance
explained, decide how many principal components to retain for your analysis.

8. Transform Data with Selected Components: Use the predict() function to transform your
data into the space of the selected principal components. #Keep, for example, the first two
principal components selected_components <- 2 reduced_data <- predict(pca_result, newdata =
scaled_data)[, 1:selected_components]
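Putting the steps above together, a compact runnable sketch, using the numeric columns of the built-in iris data as a stand-in for my_data (an assumption made only for illustration):

# Stand-in for my_data: the four numeric measurements in iris
my_data <- iris[, 1:4]

# Standardize the data
scaled_data <- scale(my_data)

# Perform PCA
pca_result <- prcomp(scaled_data)

# Explore the results
summary(pca_result)        # proportion of variance explained by each component
screeplot(pca_result)      # scree plot of the variances
head(pca_result$rotation)  # loadings of the variables on the components

# Keep the first two principal components
reduced_data <- predict(pca_result, newdata = scaled_data)[, 1:2]
head(reduced_data)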


In [19]:  # Import necessary libraries


import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris

# Load the Iris dataset as an example
iris = load_iris()
X = iris.data
y = iris.target

# Perform PCA
pca = PCA(n_components=2) # Specify the number of components to retain
X_pca = pca.fit_transform(X)

# Visualize the data in the reduced dimensional space
plt.figure(figsize=(8, 6))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', edgeco
plt.colorbar(scatter, label='Class')
plt.title('PCA of Iris Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()

# Print the explained variance ratio
print("Explained Variance Ratio:")
print(pca.explained_variance_ratio_)


Explained Variance Ratio:


[0.92461872 0.05306648]

Q17. Perform Time Series Analysis in R.


Time series analysis is a crucial technique for analyzing and modeling data that varies over
time, such as stock prices, temperature records, or sales data. In R, you can perform time series
analysis using various packages, but one of the most commonly used packages is stats for
basic time series analysis and the forecast package for more advanced forecasting tasks.
Here's a step-by-step guide on performing time series analysis in R:

1. Load the Required Packages: #Load the necessary packages library(stats) library(forecast)

2. Create or Load Time Series Data: You can create a time series object in R using the ts()
function or load time series data from a file. Ensure that your data has a timestamp or time
index. #Create a time series object (e.g., monthly data from 2020 to 2021) ts_data <- ts(c(10,
15, 20, 25, 30, 35), start = c(2020, 1), frequency = 12) #Load time series data from a file (e.g.,
CSV) #ts_data <- read.csv("your_time_series_data.csv")

3. Visualize the Time Series: To understand your data better, it's essential to plot the time
series. #Plot the time series plot(ts_data, main = "Time Series Data", xlab = "Year", ylab =
"Value")

4. Decompose the Time Series: Decomposing a time series helps to separate it into its
constituent components, such as trend, seasonality, and noise. #Decompose the time series
decomposed <- decompose(ts_data) plot(decomposed)

5. Perform Basic Time Series Analysis: Use functions like acf() (autocorrelation function) and
pacf() (partial autocorrelation function) to understand the autocorrelation in your data.
#Autocorrelation and partial autocorrelation plots acf(ts_data) pacf(ts_data)

6. Build Time Series Models: You can use various models like ARIMA (AutoRegressive
Integrated Moving Average) or Exponential Smoothing for forecasting time series data. #Fit an
ARIMA model arima_model <- auto.arima(ts_data)

7. Make Forecasts: Use your time series model to make future forecasts. #Make forecasts
forecast_values <- forecast(arima_model, h = 12) # Forecast for the next 12 time periods
plot(forecast_values, main = "Time Series Forecast")

8. Evaluate the Forecast: You can evaluate the accuracy of your forecasts using metrics like
Mean Absolute Error (MAE) or Mean Squared Error (MSE). #Evaluate the forecast
accuracy(forecast_values)

9. Visualize the Forecast: Plot the original time series data along with the forecasted values.
#Plot the original time series and forecast plot(ts_data, main = "Time Series Data and Forecast",
xlab = "Year", ylab = "Value") lines(forecast_values$mean, col = "blue") legend("topleft", legend
= "Forecast", col = "blue")


In [1]:  # Load necessary libraries


library(ggplot2)
library(forecast)

# Generate a synthetic time series with seasonality
set.seed(123)
time <- seq(from = as.Date("2020-01-01"), by = "months", length.out = 36)
value <- sin(seq(1, 2*pi, length.out = 36)) + rnorm(36, mean = 0, sd = 0.3)
ts_data <- ts(value, start = c(2020, 1), frequency = 12)

# Plot the synthetic time series
autoplot(ts_data) +
labs(title="Synthetic Time Series", x="Time", y="Value")

# Check for seasonality and trend
ggseasonplot(ts_data, year.labels=TRUE, year.labels.left=TRUE)

# Decompose the time series into components (trend, seasonal, remainder)
decomposed_ts <- decompose(ts_data)

# Plot the decomposed components
autoplot(decomposed_ts) +
labs(title="Decomposition of Synthetic Time Series")

# Fit a simple time series model (ARIMA) for forecasting
ts_model <- auto.arima(ts_data)
ts_forecast <- forecast(ts_model, h=12) # Forecasting for the next 12 time periods

# Plot the original time series and the forecast
autoplot(ts_forecast) +
labs(title="Synthetic Time Series Forecast", x="Time", y="Value")


Registered S3 method overwritten by 'quantmod':


method from
as.zoo.data.frame zoo


Q18. Manipulate text data in R for Sentiment Analysis

Text data manipulation is an essential step in preparing data for sentiment analysis in R. In
sentiment analysis, you typically need to clean and preprocess the text data to make it suitable
for analysis. Here are the key steps involved in manipulating text data for sentiment analysis in
R:

1. Load Required Libraries: First, you need to load the necessary libraries for text data
manipulation and sentiment analysis. Commonly used packages include tm (Text Mining),
stringr, and tidytext. library(tm) library(stringr) library(tidytext)

2. Load and Prepare Text Data: Load your text data, which could be in a CSV file, a data
frame, or a text corpus. Ensure that your data contains a column with the text you want to
analyze. #Load your text data (replace 'your_data.csv' with your data source) text_data <-
read.csv("your_data.csv") #Create a text corpus corpus <-
Corpus(VectorSource(text_data$your_text_column))

3. Data Cleaning: Text data is often messy, so you need to clean it by removing special
characters, numbers, and other unwanted elements. You can also convert the text to lowercase.
#Clean the text data corpus <- tm_map(corpus, content_transformer(tolower)) corpus <-
tm_map(corpus, removePunctuation) corpus <- tm_map(corpus, removeNumbers) corpus <-
tm_map(corpus, removeWords, stopwords("en")) corpus <- tm_map(corpus, stripWhitespace)

4. Tokenization: Tokenization is the process of splitting text into individual words or tokens, making it suitable for analysis. With tm, tokenization is handled when you build a Document-Term Matrix; with tidytext, unnest_tokens() splits a data frame of text into one token per row.

5. Sentiment Analysis: You can use sentiment lexicons or pre-trained models to perform sentiment analysis. For example, the tidytext package gives access to sentiment lexicons through get_sentiments(), and you can use one to determine the sentiment of each word in the text.
#Perform sentiment analysis using the tidytext package
library(tidytext)
library(dplyr)
#Transform the text data into a format suitable for sentiment analysis
#(tidytext works on a data frame with a document id column and a text column)
text_sentiment <- text_data %>%
unnest_tokens(word, your_text_column) %>%
inner_join(get_sentiments("nrc"), by = "word")
#Summarize sentiment by text element (e.g., document, sentence, etc.)
sentiment_summary <- text_sentiment %>%
group_by(document, sentiment) %>%
summarise(sentiment_count = n()) %>%
pivot_wider(names_from = sentiment, values_from = sentiment_count, values_fill = 0)

6. Analyze Sentiment: You can now analyze the sentiment of the text data by aggregating and
summarizing the sentiment scores. #Analyze sentiment head(sentiment_summary) This will give
you a summary of sentiment scores for each document or text element.

7. Interpret Results: Based on the sentiment scores, you can interpret whether the text is
generally positive, negative, or neutral. These are the fundamental steps for manipulating text
data for sentiment analysis in R. Depending on the complexity of your analysis, you may need to
explore additional text preprocessing and sentiment analysis techniques, such as custom
lexicons or machine learning models for sentiment classification.
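As a self-contained illustration of this workflow, here is a small runnable sketch; the two example sentences, the column names, and the choice of the Bing lexicon (which ships with tidytext) are illustrative assumptions, not data from the lab.

# Minimal sentiment-analysis sketch with tidytext and dplyr
library(dplyr)
library(tidytext)

text_data <- data.frame(
  document = c(1, 2),
  text = c("I love this product, it works great",
           "The service was terrible and slow"),
  stringsAsFactors = FALSE
)

# Tokenize to one word per row, then join each word against the Bing lexicon
text_sentiment <- text_data %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("bing"), by = "word")

# Count positive and negative words per document
sentiment_summary <- text_sentiment %>%
  count(document, sentiment) %>%
  tidyr::pivot_wider(names_from = sentiment, values_from = n, values_fill = 0)

print(sentiment_summary)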


Q19. Implement topic modelling in R.


Topic modeling is a technique used to discover topics or themes within a collection of
documents. In R, the tm and topicmodels packages are commonly used for topic modeling.
Here's a step-by-step guide on how to implement topic modeling in R:

1. Load the Required Libraries: First, load the necessary libraries for text preprocessing and
topic modeling. library(tm) library(topicmodels)

2. Prepare and Preprocess Text Data: Load your text data and preprocess it, similar to the steps for sentiment analysis. Cleaning, tokenization, and creating a Document-Term Matrix (DTM) are crucial.
#Load your text data (replace 'your_data.csv' with your data source)
text_data <- read.csv("your_data.csv")
#Create a text corpus
corpus <- Corpus(VectorSource(text_data$your_text_column))
#Clean the text data
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stripWhitespace)
#Create a Document-Term Matrix (DTM); tokenization into words happens at this step
dtm <- DocumentTermMatrix(corpus)

3. Build the Topic Model: Now, you can build a topic model using the LDA() function from the
topicmodels package. Specify the number of topics (k) you want to discover. #Build the topic
model k <- 5 # Number of topics lda_model <- LDA(dtm, k = k)

4. Explore Topics: You can explore the topics and associated words using the terms() function.
This will give you a list of words for each topic. #Explore topics terms(lda_model, 5) # Show the
top 5 words for each topic

5. Assign Topics to Documents: You can assign each document its most likely topic using the topics() function from the topicmodels package.
#Assign topics to documents
topic_assignments <- as.data.frame(topics(lda_model))
text_data_with_topics <- cbind(text_data, topic_assignments)

6. Interpret Topics: Inspect the top words in each topic to interpret what each topic represents.
This will help you label the topics based on the words associated with them.

7. Visualize Topics: You can visualize the topics and their relationships using various visualization techniques, including word clouds, bar plots, or network graphs.
#Visualize topics using word clouds
library(wordcloud)
wordcloud(terms(lda_model, 10))

Topic modeling is a valuable technique for discovering latent themes or topics in text data. The choice of the number of topics (k) is a crucial decision and might require experimentation. Additionally, topic modeling can be further enhanced with more advanced techniques, such as using other topic modeling algorithms or performing sentiment analysis within each topic to gain deeper insights.


In [5]:  # Install and load necessary packages


install.packages(c("tm", "topicmodels", "tidytext", "dplyr"))

library(tm)
library(topicmodels)
library(tidytext)
library(dplyr)

# Create a synthetic dataset
set.seed(123)
docs <- c(
"Machine learning algorithms are used for data analysis.",
"Natural language processing helps computers understand human languages.",
"Topic modeling is an interesting area of research in artificial intelligence.",
"The LDA algorithm is commonly used for topic modeling in text mining.",
"Text mining involves extracting useful information from large amounts of text."
)

# Create a corpus
corpus <- Corpus(VectorSource(docs))

# Preprocess the corpus
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stripWhitespace)

# Create a document-term matrix
dtm <- DocumentTermMatrix(corpus)

# Convert the document-term matrix to a matrix
mat <- as.matrix(dtm)

# Create an LDA model
lda_model <- LDA(mat, k = 2) # Specify the number of topics (k)

# Display the terms associated with each topic
terms(lda_model)

# Visualize the topics
topics <- tidy(lda_model, matrix = "beta")
top_terms <- topics %>%
group_by(topic) %>%
top_n(5, beta) %>%
arrange(topic, -beta)

print(top_terms)


Installing packages into ‘/usr/local/lib/R/site-library’


(as ‘lib’ is unspecified)

Warning message:
“unable to access index for repository http://cran.rstudio.com/src/contrib:
cannot open URL 'http://cran.rstudio.com/src/contrib/PACKAGES'”
Warning message:
“packages ‘tm’, ‘topicmodels’, ‘tidytext’, ‘dplyr’ are not available for
this version of R

Versions of these packages for your version of R might be available elsewhere,
see the ideas at
https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages”
Loading required package: NLP

Attaching package: ‘NLP’

The following object is masked from ‘package:ggplot2’:

annotate

The following object is masked from ‘package:httr’:

content

Warning message in tm_map.SimpleCorpus(corpus, content_transformer(tolowe


r)):
“transformation drops documents”
Warning message in tm_map.SimpleCorpus(corpus, removePunctuation):
“transformation drops documents”
Warning message in tm_map.SimpleCorpus(corpus, removeNumbers):
“transformation drops documents”
Warning message in tm_map.SimpleCorpus(corpus, removeWords, stopwords("en
glish")):
“transformation drops documents”
Warning message in tm_map.SimpleCorpus(corpus, stripWhitespace):
“transformation drops documents”

Topic 1
'algorithms'
Topic 2
'text'


# A tibble: 34 × 3
# Groups: topic [2]
topic term beta
<int> <chr> <dbl>
1 1 algorithms 0.0714
2 1 analysis 0.0714
3 1 data 0.0714
4 1 learning 0.0714
5 1 machine 0.0714
6 1 used 0.0714
7 1 computers 0.0714
8 1 helps 0.0714
9 1 human 0.0714
10 1 language 0.0714
# ℹ 24 more rows

Q20. Review and final project using a combination of the techniques learned

A final project that combines various data analysis and machine learning techniques can be an excellent way to apply what you've learned in R. Such a project can be both challenging and rewarding. Here's a sample project idea that combines multiple techniques:

Project Idea: Predicting Customer Churn in a Telecom Company

In this project, you'll work on a simulated dataset from a telecom company to predict customer churn. You'll combine data preprocessing, exploratory data analysis, feature engineering, machine learning, and evaluation techniques. Here's an outline of the project:

1. Data Collection: Start by obtaining the dataset. You can simulate customer data with
features such as customer demographics, usage patterns, contract details, and customer churn
status (whether they churned or not).

2. Data Preprocessing: Clean the data by handling missing values and outliers. Perform
feature scaling if necessary. Encode categorical variables. Split the data into training and testing
sets.

3. Exploratory Data Analysis (EDA): Conduct EDA to understand the relationships between
different features and the target variable (churn). Visualize the data using various plots and
charts to gain insights.

4. Feature Engineering: Create new features or modify existing ones that may be useful for
predicting customer churn. Extract relevant information from features like contract length and
usage patterns.

5. Machine Learning: Select machine learning algorithms suitable for the classification task
(e.g., logistic regression, decision trees, random forests, or gradient boosting). Train and
evaluate multiple models using cross-validation. Tune hyperparameters to optimize model
performance. Consider ensembling techniques if necessary.


6. Model Evaluation: Evaluate model performance using metrics such as accuracy, precision,
recall, F1-score, and ROC AUC. Create a confusion matrix and visualize it. Consider plotting the
ROC curve and Precision-Recall curve.

7. Interpretation: Interpret the model results to understand which features are most influential
in predicting churn. Identify actionable insights that the telecom company can use to reduce
customer churn.

8. Report and Presentation: Create a report or presentation summarizing the project, including
data preprocessing, EDA, modeling, and results. Clearly explain the methodology and key
findings. Present the predictive model's performance and its implications for the telecom
company.

9. Future Recommendations: Provide recommendations for the company based on the analysis. Suggest strategies to reduce customer churn.

10. Code and Documentation: Ensure that your code is well documented, organized, and reproducible so that others can follow your analysis.
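As a very small sketch of the modelling core of such a project, the block below trains and evaluates a logistic regression churn model; all variable names and the simulated values are assumptions made for illustration, not a real telecom dataset.

# Simulate a small churn dataset
set.seed(42)
n <- 500
telco <- data.frame(
  tenure_months   = rpois(n, 24),
  monthly_charges = runif(n, 20, 100),
  contract_yearly = rbinom(n, 1, 0.4)
)
# Churn probability: higher charges increase churn, longer tenure and yearly contracts reduce it
p <- plogis(-1 + 0.02 * telco$monthly_charges - 0.05 * telco$tenure_months - 1 * telco$contract_yearly)
telco$churn <- rbinom(n, 1, p)

# Train/test split
idx <- sample(seq_len(n), size = 0.7 * n)
train <- telco[idx, ]
test <- telco[-idx, ]

# Logistic regression model
model <- glm(churn ~ tenure_months + monthly_charges + contract_yearly,
             data = train, family = binomial)

# Evaluate with a confusion matrix and accuracy on the test set
pred <- ifelse(predict(model, newdata = test, type = "response") > 0.5, 1, 0)
conf_mat <- table(Predicted = pred, Actual = test$churn)
print(conf_mat)
cat("Accuracy:", mean(pred == test$churn), "\n")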
