Download as docx, pdf, or txt
Download as docx, pdf, or txt
You are on page 1 of 2

# Load the training and test sets through interactive file pickers. Strings
# are kept as character so missing categories can be recoded before any
# conversion to factors.
housetrain <- read.csv(file.choose(), stringsAsFactors = FALSE)
housetest <- read.csv(file.choose(), stringsAsFactors = FALSE)

# rbind() needs the same columns in both data frames, so the response
# (SalePrice, which is re-attached from housetrain later on) is dropped from
# the training set by name rather than by position.
houseall <- rbind(housetrain[names(housetrain) != "SalePrice"], housetest)

# Align the levels of each training column with the combined data.
# For character columns levels() is NULL, so this leaves them unchanged; it
# only has an effect when the columns are factors.
for (col in names(houseall)) {
  levels(housetrain[[col]]) <- levels(houseall[[col]])
}

# Number of missing values per column, largest first.
sort(colSums(is.na(houseall)), decreasing = TRUE)
# Rule of thumb used here: impute a column only if more than 50% of its data
# is available; if less than 50% is available, either delete the column or
# recode the missing values as an explicit "Missing"/"None" category.
# PoolQC: 2909 values are missing out of 2919 observations.
2909 / 2919 # 99.65% of the data is missing
# The variable is considered significant, so it is not deleted; its missing
# values are recoded as 'None'. MiscFeature, Alley and Fence get the same
# treatment.
table(houseall$PoolQC)
for (col in c("PoolQC", "MiscFeature", "Alley", "Fence")) {
  houseall[[col]][is.na(houseall[[col]])] <- "None"
}
sort(colSums(is.na(houseall)), decreasing = TRUE)

# FireplaceQu is recoded the same way after inspecting its categories.
table(houseall$FireplaceQu)
houseall$FireplaceQu[is.na(houseall$FireplaceQu)] <- "None"
# For missing-value imputation the data is split by column type:
#   numeric     -> mean or median imputation
#   categorical -> mode imputation (or an explicit 'None' category)
houseallnumeric <- houseall[vapply(houseall, is.numeric, logical(1))]
houseallcategory <- houseall[vapply(houseall, is.character, logical(1))]
sort(colSums(is.na(houseallcategory)), decreasing = TRUE)
lapply(houseallcategory, table)

# Replace the missing values of every categorical column with that column's
# most frequent value. which.max() returns the *position* of the largest
# count, so the category label is looked up through names(); using the
# position directly would fill the gaps with strings such as "3".
# The result is a character matrix with one column per input column.
houseallcategoryimp <- vapply(
  houseallcategory,
  function(x) {
    counts <- table(x)
    if (length(counts) == 0L) {
      # Column has no observed value at all: there is no mode to impute.
      return(x)
    }
    x[is.na(x)] <- names(counts)[which.max(counts)]
    x
  },
  character(nrow(houseallcategory))
)
sort(colSums(is.na(houseallcategoryimp)), decreasing = TRUE)
# Inspect the numeric columns and how many values each one is missing.
str(houseallnumeric)
sort(colSums(is.na(houseallnumeric)), decreasing = TRUE)

# Year and month columns are kept apart from the remaining numeric columns.
datecols <- c("GarageYrBlt", "YearBuilt", "YearRemodAdd", "MoSold", "YrSold")
housealldate <- houseall[, datecols]
houseallnum1 <- houseallnumeric[
  ,
  is.na(match(colnames(houseallnumeric), datecols))
]
# Column bookkeeping: 32 numeric + 5 date + 43 categorical = 80 in total.
sort(colSums(is.na(housealldate)))
table(housealldate$GarageYrBlt)

# Missing garage construction years are filled with the fixed year 2005.
housealldate$GarageYrBlt <- replace(
  housealldate$GarageYrBlt,
  is.na(housealldate$GarageYrBlt),
  2005
)
sort(colSums(is.na(houseallnum1)), decreasing = TRUE)
# Mean-impute BsmtFullBath in place so that houseallnum1 itself carries the
# filled-in column.
houseallnum1$BsmtFullBath[is.na(houseallnum1$BsmtFullBath)] <-
  mean(houseallnum1$BsmtFullBath, na.rm = TRUE)

# Mean-impute every remaining numeric column. The result is a numeric matrix
# with one column per input column.
houseallnumimp <- vapply(
  houseallnum1,
  function(x) {
    x <- as.numeric(x)
    x[is.na(x)] <- mean(x, na.rm = TRUE)
    x
  },
  numeric(nrow(houseallnum1))
)
sort(colSums(is.na(houseallnumimp)), decreasing = TRUE)
# Recombine the imputed pieces into one data frame. The categorical columns
# are converted to factors here, on the combined data, so that the training
# and test rows share identical factor levels. Since R 4.0 data.frame() no
# longer does this by default, so it is requested explicitly.
houseallimputed <- data.frame(
  houseallcategoryimp, housealldate, houseallnumimp,
  stringsAsFactors = TRUE
)

# The combined data was built as training rows followed by test rows, so the
# split point is the number of training rows.
is_train_row <- seq_len(nrow(houseallimputed)) <= nrow(housetrain)
housetrainimp <- houseallimputed[is_train_row, ]
housetestimp <- houseallimputed[!is_train_row, ]

# Re-attach the response to the training rows.
housetrainfinal <- data.frame(
  housetrainimp,
  SalePrice = housetrain$SalePrice
)
# Linear regression of SalePrice on all remaining predictors. Column 49 is
# left out of both the training and the test frame
# (NOTE(review): presumably the Id column - confirm).
housereg <- lm(formula = SalePrice ~ ., data = housetrainfinal[, -49])
summary(object = housereg)
regpredict <- predict(object = housereg, newdata = housetestimp[, -49])
# Decision tree. The rpart package is attached here so the script also runs
# in a fresh session instead of relying on a manual "activate" step.
library(rpart)
houserpart <- rpart(SalePrice ~ ., data = housetrainfinal[-49])
rpratpredict <- predict(houserpart, housetestimp[-49])
write.csv(rpratpredict, "rpartpred.csv")
# Random forest. The randomForest package is attached here so the script
# also runs in a fresh session instead of relying on a manual "activate"
# step.
library(randomForest)
houseRF <- randomForest(
  SalePrice ~ .,
  data = housetrainfinal[-49],
  ntree = 3000,
  do.trace = 100 # print progress every 100 trees
)
RFpredict <- predict(houseRF, housetestimp[-49])
write.csv(RFpredict, "RFpred.csv")
# Gradient boosting. The gbm package is attached here so the script also
# runs in a fresh session instead of relying on a manual "activate" step.
library(gbm)
housegbm <- gbm(
  SalePrice ~ .,
  data = housetrainfinal[-49],
  distribution = "gaussian", # stated explicitly for the numeric response
  n.trees = 1000,
  cv.folds = 3
)

# Predict with the number of trees selected by cross-validation, passed
# explicitly instead of leaving an empty trailing argument in the call.
gbmpred <- predict(
  housegbm,
  housetestimp[-49],
  n.trees = gbm.perf(housegbm, method = "cv", plot.it = FALSE)
)
write.csv(gbmpred, "gbmpred.csv")

You might also like