HRM

grades=c("O","B+","A","A+","C","B","A","B+","A","O+")

grades
marks=c(86,78,45,90,59,78,87,44,98,60)
grades
rno=c(1:10)
rno
marks=marks+2
marks
slab=marks/10
slab
grades[3]
slab[c(3,5)]
marks[8:10]
gf=as.factor(grades)
gf
as.numeric(gf)
rno=c(rno,11)
rno
marks=c(marks,NA)
marks
grades=c(grades,NA)
grades
anyNA(marks)
mean(marks)
mean(marks,na.rm = TRUE)
d=data.frame(rno,grades,marks)
d
head(d)
tail(d)
grades[5]

x=5
x
x<-5
x
x=y=5
x
y=5L
class(y)
class(x)
x+y
x-y
x==y
x=c(1:10)
x
y=c(-5:4)
y
z=c("hockey","football","basketball","cricket","tennis","hockey","football","basketball","cricket","tennis")
z
unique=as.factor(z)
unique
nchar(z)
l=data.frame(x,y,z)
l
class(l)
mtcars
x=c(1:5)
x
y=c(-2:2)
y
data.frame(x,y)
x+y
x-y
x*y
x/y
x^2
sqrt(x)
sqrt(y)
date1=as.Date("2019-11-15")
date1
dim(l)
str(l)
x=c(1,2,NA,3,4,NA)
mean(x)
mean(x,na.rm = TRUE)
TRUE*10
FALSE*5

mtcars
dim(mtcars)
row.names(mtcars)
colnames(mtcars)
mtcars[,4]
mtcars[2,]
mtcars[8,4]
head(mtcars)
tail(mtcars)
mtcars[3,4:5]
mean(mtcars$mpg)
summary(mtcars)
View(mtcars)
mtcars
library(dplyr)   # %>% and the verbs below (select, filter, arrange, mutate) come from dplyr
mtcars %>% select(disp,mpg,cyl) %>% filter(mpg<22,cyl==4) %>% arrange(disp) %>% mutate(mean(disp))
mtcars
View(mtcars)
data(mtcars)

mtcars
sum(mtcars$mpg>=21)
sum(mtcars$mpg==21)
mtcars %>% select(mpg,cyl) %>% filter(mpg>=21)
library(dplyr)
mtcars %>% select(am,mpg) %>% filter(mpg>24,am==1)

airquality
View(airquality)
anyNA(airquality)
mean(airquality$Ozone)
mean(airquality$Ozone,na.rm = TRUE)
median(airquality$Solar.R,na.rm = TRUE)
airquality[4,4]
airquality %>% select(Temp,Day) %>% filter(Day==4)
airquality %>% select(Wind,Day) %>% filter(Day==11)
airquality %>% select(Temp,Day,Solar.R) %>% filter(Day==4,Solar.R>=313)
airquality %>% select(Temp,Day,Month) %>% filter(Day==4,Month==5)

# arrange value of ozone in descending order and give details of temperature in 6th month
airquality %>% select(Temp,Month,Ozone) %>% filter(Month==6) %>% arrange(desc(Ozone))

# give value of ozone on the day when wind is 8 and temperature is 72


airquality %>% select(Ozone,Day,Wind,Temp) %>% filter(Wind==8,Temp==72)

# mutate is for adding a new variable


airquality %>% mutate(Ratio=Temp/Wind)
air=airquality
air
anyNA(air)
head(air)

# replace missing (NA) values


library(tidyr)   # provides replace_na()
replace_na(air$Ozone,replace = mean(air$Ozone,na.rm = TRUE))
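# A minimal sketch (assuming dplyr and tidyr are loaded): the call above only
# prints the imputed vector, so assign the result back into the data frame,
# e.g. with mutate(), if you want to keep it.
air = air %>% mutate(Ozone = replace_na(Ozone, mean(Ozone, na.rm = TRUE)))
anyNA(air$Ozone)   # should now be FALSE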

library(hflights)   # Houston flights dataset
hflights
hf=hflights

# replace, group_by, summarise


View(hf)
k=hf %>% group_by(UniqueCarrier) %>% filter(Distance>3000)
k
View(k)
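# The comment above also mentions summarise; a minimal sketch (assuming dplyr
# and the hflights data are loaded): average arrival delay per carrier.
hf %>% group_by(UniqueCarrier) %>% summarise(mean_arr_delay = mean(ArrDelay, na.rm = TRUE))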

# where taxiing took longer than the flight itself


View(hf %>% select(TaxiIn,TaxiOut,AirTime) %>% filter(TaxiIn+TaxiOut>AirTime))

# all flights that departed late but arrived ahead of schedule


View(hf %>% group_by(UniqueCarrier) %>% filter(DepDelay>0,ArrDelay<0))

# all flights that departed before 5 PM (1700 hrs) and arrived after 10 PM (2200 hrs)


View(hf %>% filter(DepTime<1700,ArrTime>2200))

# all cancelled flights on the weekend (DayOfWeek 6 = Saturday, 7 = Sunday)


View(hf %>% filter(Cancelled == 1, DayOfWeek %in% c(6,7)))

# data frame
player=c("sachin","sehwag","virat","dhoni","yuvraj","raina","bumrah","shami","ashwin","bhuvneshwar")
player
score=c(99,209,183,156,143,56,33,10,5,16)
score
matchesplayed=c(234,209,123,235,123,108,45,56,34,123)
matchesplayed
dismissals=c(134,180,98,189,100,98,40,NA,32,NA)
dismissals
tournament=c("ICC WC","ICC WC","ICC WC","ICC WC","ICC WC","ICC WC","ICC WC","ICC WC","ICC WC","ICC WC")
tournament
l=data.frame(player,score,matchesplayed,dismissals,tournament)
l

# value in 3rd column and 5th row


l[5,3]
# no. of rows and column
dim(l)
# number of rows
nrow(l)

# first 7 rows using the head command


head(l,7)

# apply select and filter player having more than 150 runs
l %>% select(player,score) %>% filter(score>150)

# info in 4th column


l[,4]

# replace NA with median


g=replace_na(dismissals,replace = median(dismissals,na.rm = TRUE))
g

# DISTRIBUTION DIAGRAMS

library(ggplot2)   # provides the diamonds dataset and ggplot()
diamonds
# create a histogram with carat on the x-axis (hist() plots frequency on the y-axis; xlab/ylab only set the labels)
hist(diamonds$carat,xlab = "carat",ylab = "price",main = "chandan")

# scatterplot
plot(diamonds$carat,diamonds$price)

# boxplot
boxplot(diamonds$carat,diamonds$price)
summary(diamonds$carat)

# ggplot2
ggplot(diamonds,aes(x=carat))+geom_histogram()+facet_wrap(~color)

# mtcars: prepare histograms for automatic and manual transmissions (am), taking mpg as xlab and cyl as ylab
mtcars
hist(mtcars$mpg,xlab = "mpg",ylab = "cyl",main = "chandan")
ggplot(mtcars,aes(x=mpg))+geom_histogram()+facet_wrap(~am)

# for two variables


ggplot(diamonds,aes(x=carat))+geom_histogram()+facet_grid(cut~color)
ggplot(mtcars,aes(x=mpg))+geom_histogram()+facet_grid(cyl~am)

# binwidth sets the histogram bin width, geom_point is for scatter points, alpha is for fading of dots,
# geom_violin is for a violin-shaped graph
iris
head(iris)
ggplot(iris,aes(x=Sepal.Length))+geom_histogram(binwidth = .5, fill="blue")
ggplot(iris,aes(x=Sepal.Length, y=Sepal.Width, color=Species))+geom_point(alpha=1)+geom_boxplot()+
  geom_violin(alpha=0.2)+facet_wrap(~Species)+ggtitle("Flower Data")

# geom_density draws a DENSITY DIAGRAM


diamonds
ggplot(diamonds,aes(x=carat))+geom_density(aes(fill="red"))
ggplot(diamonds,aes(x=carat, y=price))+geom_smooth()
# REGRESSION, lm=Linear Model
z=lm(price~carat, data=diamonds)
z
summary(z)
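# A minimal sketch (carat values here are illustrative): predict() uses the
# fitted linear model to estimate price for new carat values.
predict(z, newdata = data.frame(carat = c(0.5, 1, 2)))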

# LINE DIAGRAM (DEVIATION)


economics
ggplot(economics,aes(x=date, y=pop, color="red"))+geom_line()

# PIE CHART: geom_bar alone gives a bar chart; adding coord_polar turns it into a pie chart
G1=LETTERS[1:5]
G1
V=c(33,45,65,43,87)
V
D=data.frame(G1,V)
D
ggplot(D, aes(x="", y=V, fill=G1))+geom_bar(stat = "identity", width = 1)+coord_polar("y", start = 0)+ggtitle("PIECHART")

mtcars

# CORRELATION
cor(mtcars)
View(cor(mtcars))

# ROUNDING OFF DECIMAL VALUES


q=round(cor(mtcars),2)

ggcorrplot(q,method = "circle")
ggcorrplot(q,method = "square")
ggcorrplot(q,method = "square", type = "lower")
ggcorrplot(q,method = "square", type = "upper")
library(corrplot)   # for corrplot() and corrplot.mixed()
corrplot.mixed(q,upper = "square", lower = "number")

diamonds
d=diamonds
z=d[,5:10]
b=cor(z)
b
corrplot(b,method = "square", type = "lower")
corrplot.mixed(b,upper = "ellipse", lower = "number")

# FORECAST

austres
dim(austres)

# IMPORTING DATASET
read.csv(file.choose(), header = TRUE, stringsAsFactors = F)

rainfall=c(234,678,234,243,567,345,242,890,356,234,567,234)
rainfall2=c(789,678,453,346,987,341,890,427,341,543,987,341)
dim(rainfall)
str(rainfall)
combinedrainfall=matrix(c(rainfall,rainfall2),nrow=12)
combinedrainfall

#command for timeseries


# general syntax: timeseries.object.name = ts(data, start, end, frequency)
combinedrainfall.timeseries=ts(combinedrainfall,start = c(2012,1),frequency = 12)
combinedrainfall.timeseries
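# A small sketch: plot() on a multi-column time series draws one panel per
# rainfall series, a quick visual check of the object just created.
plot(combinedrainfall.timeseries)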
AirPassengers
Airp=AirPassengers
Airp
Air.timeseries=ts(Airp,start = c(1949,1),end = c(1960,1),frequency = 12)
Air.timeseries
start(Air.timeseries)
end(Air.timeseries)
frequency(AirPassengers)

library(forecast)   # for auto.arima(), forecast() and accuracy()
arimafit=auto.arima(Airp)
arimafit
predictingfuture=forecast(arimafit, h=60)
plot(predictingfuture)

predictingfuture2=predict(arimafit, n.ahead = 60)


predictingfuture2

USAccDeaths
usd=USAccDeaths
usd
start(usd)
end(usd)
frequency(usd)
US=ts(usd, start = c(1973,1), end = c(1978,12), frequency = 12)
US
usdeath=auto.arima(usd)
usdeath
forcast=forecast(usdeath, h=48)
forcast
plot(forcast)
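# A minimal sketch: accuracy() from the forecast package reports training-set
# error measures (ME, RMSE, MAE, ...) for the fitted ARIMA model.
accuracy(usdeath)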

# SLICE COMMAND
PlantGrowth
P=PlantGrowth
P
P %>% slice(c(1:5),8,15:20)

# for making the data stationary: log() stabilises the variance, diff() removes the trend (time dependence)
Airp
plot(Airp)
plot(log(Airp))
plot(diff(log(Airp)))
fit=auto.arima(log(Airp), approximation = F, trace = F)
fit
prediction=predict(fit, n.ahead = 36)
prediction

# using antilog
newpred=round(exp(prediction$pred),0)   # exp() is the exact antilog of the natural log
newpred
ts.plot(Airp, newpred, col=c("blue","green"))

# linear model

USArrests
str(USArrests)
ggplot(USArrests, aes(x=Rape, y=Assault))+geom_smooth(method = "lm")+geom_point()
lm(Rape~Murder+Assault+UrbanPop, data = USArrests)

# alternative to the command above: the formula Rape ~ . uses all remaining variables as predictors


lm(Rape~.,data = USArrests)

## step 1 : data pre-processing


diabetes=read.csv(file.choose())
diabetes
str(diabetes)
dim(diabetes)
set.seed(12345)

## step 2 : data preparation


anyNA(diabetes)

## step 3 : data partitioning


library(caret)   # for createDataPartition(), train(), confusionMatrix()
intrainnew=createDataPartition(diabetes$Outcome, p=0.75, list = F)
training=diabetes[intrainnew,]
training
testing=diabetes[-intrainnew,]
testing
dim(training)
dim(testing)
## step 4 : model building
model=train(as.factor(Outcome)~.,data = training, method="glm")
model

## step 5 : data prediction


Pred=predict(model, newdata=testing)
Pred
confusionMatrix(Pred, as.factor(testing$Outcome) )
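# A minimal sketch (same caret workflow, shown here only as an illustration):
# resampling via 5-fold cross-validation can be requested through trainControl().
ctrl = trainControl(method = "cv", number = 5)
model_cv = train(as.factor(Outcome)~., data = training, method = "glm", trControl = ctrl)
model_cv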

##################################################################################################
#######################################################################################

library(kernlab);data("spam")
spam
str(spam)
dim(spam)
set.seed(23456)
intrain=createDataPartition(y=spam$type, p=0.75, list = F)
intrain
training1=spam[intrain,]
training1
testing1=spam[-intrain,]
testing1
dim(testing1)
dim(training1)
model1=train(type~.,data = training1, method="knn")
model1
pred=predict(model1, newdata=testing1)
pred
confusionMatrix(pred, testing1$type)

# UNSUPERVISED LEARNING

ir=iris
ir
ir=iris[,-5]
ir
ggplot(iris,aes(x=Sepal.Length, y=Petal.Length, color=Species))+geom_point(alpha=1)

## distribute into 3 clusters


New=kmeans(dist(ir),3)
New
New$cluster

## append the cluster assignments as a new variable "clusters"


iris1=ir %>% mutate(clusters=New$cluster)
iris1
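# A minimal sketch (not part of the original exercise, and applied to the raw
# measurements rather than the distance matrix): the "elbow" in the total
# within-cluster sum of squares helps justify choosing 3 clusters.
wss = sapply(1:10, function(k) kmeans(ir, centers = k, nstart = 10)$tot.withinss)
plot(1:10, wss, type = "b", xlab = "number of clusters k", ylab = "total within-cluster SS")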

## hierarchical clustering ## dendrogram plot


h1=hclust(dist(iris1), method = "ward.D")
plot(h1)

## using method "complete


h1=hclust(dist(iris1), method = "complete")
plot(h1)
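# A minimal sketch: cutree() cuts the dendrogram into a chosen number of
# groups, giving cluster labels comparable to the kmeans result above.
cutree(h1, k = 3)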

intel=readLines(file.choose())
intel

library(tm)   # for Corpus(), tm_map(), stopwords(), TermDocumentMatrix()
articleintel=Corpus(VectorSource(intel))
articleintel
articleintel=tm_map(articleintel,removeNumbers)
articleintel=tm_map(articleintel,removePunctuation)
stopwords()
articleintel=tm_map(articleintel,removeWords,stopwords())
articleintel=tm_map(articleintel,removeWords,c("its","will","has","the"))
library(wordcloud)   # for wordcloud()
wordcloud(articleintel,random.order = F)

## create TDM
tdm=TermDocumentMatrix(articleintel)
tdm
class(tdm)
tdm=as.matrix(tdm)
tdm
View(tdm)
termfreq=rowSums(as.matrix(tdm))
termfreq
View(termfreq)

## subsetting TDM
termfreqsubset=subset(termfreq, termfreq>=2)
termfreqsubset
View(termfreqsubset)

## creating a dataframe
tdmf=data.frame(term=names(termfreqsubset),freq=termfreqsubset)
tdmf
row.names(tdmf)=NULL
View(tdmf)
tdmplot=ggplot(tdmf, aes(x=term, y=freq))+geom_bar(stat = "identity")+xlab("Terms")+ylab("Count")+coord_flip()+
  theme(axis.text = element_text(size = 6))
tdmplot

## SENTIMENT ANALYSIS
class(articleintel)
a=as.character(articleintel)
class(a)
library(syuzhet)   # for get_nrc_sentiment()
mysentiment=get_nrc_sentiment(a)
mysentiment
sentimentscores=data.frame(colSums(mysentiment[,]))
sentimentscores

## giving name to score column


names(sentimentscores)="Score"
sentimentscores

## adding the row names as a "sentiment" column


sentimentscores= cbind("sentiment" = rownames(sentimentscores), sentimentscores)
sentimentscores
## removing row names
rownames(sentimentscores)=NULL
sentimentscores

## plotting the sentiment scores


ggplot(sentimentscores, aes(x = sentiment, y = Score))+geom_bar(aes(fill=sentiment), stat = "identity")+
  theme(legend.position = "none") + xlab("sentiment") + ylab("Score") + ggtitle("Total Sentiment Score")

## TOPIC MODELING
## Latent Dirichlet Allocation (LDA) models are a widely used topic modeling technique
## create DTM

intel = list.files(getwd(), pattern = "\\.txt$")   # regular expression matching files ending in .txt


intel
files=lapply(intel,readLines)
files

articleintel=Corpus(VectorSource(files))
articleintel
articleintel=tm_map(articleintel,removeNumbers)
articleintel=tm_map(articleintel,removePunctuation)
stopwords()
articleintel=tm_map(articleintel,removeWords,stopwords())
articleintel=tm_map(articleintel,removeWords,c("its","will","has","the"))
wordcloud(articleintel,random.order = F)

articledtm = DocumentTermMatrix(articleintel, control = list(minWordLength = 3))


articledtm
k=3
SEED = 1234
library(topicmodels)   # for LDA(), topics(), terms()
article.lda=LDA(articledtm, k, method = "Gibbs", control = list(seed = SEED))
article.lda
lda.topics = as.matrix(topics(article.lda))
lda.topics
lda.terms=terms(article.lda)
lda.terms
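# A minimal sketch: terms() can also return the top n terms per topic, which
# makes the three topics easier to interpret.
terms(article.lda, 10)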
