
Tables

Frequency Table
1. Method 1
a. freq_var <- df %>% count(var) # count() can take more than one variable
b. freq_var <- spread(freq_var, key = 'differentiator', value = n) # OPTIONAL:
only if there is more than one variable, e.g. comparing genders (var 1) across
different events (var 2); the differentiator is then gender
c. kable(freq_var)
Table to compare mean, sd, etc
1. tab <- df %>% group_by(group) %>% summarise(mean = mean(var), SD = sd(var),
Total = sum(var)) # use bare column names inside summarise(); the arguments can be
any subset of these, and there can be more than one at once
2. colnames(tab) <- c('header1', 'header2', 'header3') # OPTIONAL
3. kable(tab)
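For example, a minimal sketch using the built-in mtcars data, assuming dplyr and knitr are loaded (the grouping variable and headers are illustrative):
library(dplyr)
library(knitr)
tab <- mtcars %>% group_by(cyl) %>% summarise(mean = mean(mpg), SD = sd(mpg), Total = sum(mpg))
colnames(tab) <- c('Cylinders', 'Mean MPG', 'SD of MPG', 'Total MPG')
kable(tab)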

Contingency Table
1. Using rpivotTable
a. data <- df %>% select(a, b, c) # optional, depends on the question
b. rpivotTable(data, rows = 'sub category', cols = 'what you are comparing
against', values = "n", aggregatorName = "Count")
2. Using kable
a. dt1 <- dt %>% group_by(a, b) %>% tally()
b. dt1.spread <- dt1 %>% spread(key = differentiator, value = n)
c. dt1.spread[is.na(dt1.spread)] <- 0 #convert NA to 0 value OR omit NA values
d. kable(dt1.spread, caption = "title")
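A minimal worked sketch of the kable version, assuming dplyr, tidyr and knitr are loaded and using mtcars (cyl vs gear is illustrative):
library(dplyr); library(tidyr); library(knitr)
dt1 <- mtcars %>% group_by(cyl, gear) %>% tally()
dt1.spread <- dt1 %>% spread(key = gear, value = n)
dt1.spread[is.na(dt1.spread)] <- 0
kable(dt1.spread, caption = "Number of cars by cylinders and gears")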
Data Visualisation
Histogram
1. par(mfrow = c(2, 2)) # optional: to see multiple histograms in one plotting window
(usually used when doing ANOVA)
2. hist(df$var, main = "title", xlim = c(lower, upper), ylim = c(lower, upper), labels =
TRUE, xlab = "x-axis label", ylab = "y-axis label") # y-axis: frequency; x-axis:
variable
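For instance, a quick sketch on mtcars (limits and labels are illustrative):
hist(mtcars$mpg, main = "Distribution of fuel economy", xlim = c(10, 35), ylim = c(0, 14),
labels = TRUE, xlab = "Miles per gallon", ylab = "Frequency")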

Boxplot
1. var.box <- boxplot(df$var, horizontal = TRUE/FALSE, range = x, main = ‘title’, xlab =
‘x-axis label’, na.rm = TRUE)
2. var.box$out #returns the outlier values
Bar Chart:
1. Ungrouped:
a. par(mar = c(4.1, 7, 4.1, 1.9)) # optional, to increase margin size (bottom, left, top, right)
b. bar <- barplot(data$var, names.arg = data$group, xlim = c(min, max), horiz =
TRUE/FALSE, las = 0/1/2/3, cex.names = 0.5, xlab = “x-axis”, main = “title”)
#las is optional, to change orientation of axis labels (0: always parallel to axis,
1: always horizontal, 2: always perpendicular to axis, 3: always vertical)
2. Grouped:
a. Create frequency table
b. bar <- as.matrix(freq_tab[, c(col1, col2)]) # col1 to col2 are the columns holding the
counts of the variable that you want to cluster the bars by in the bar chart
c. col <- rainbow(length(freq_tab$group)) # group is the variable you want to colour the
bars by (the main comparison between variables)
d. barplot(bar, beside = TRUE/FALSE, horiz = TRUE/FALSE, col = col, main =
“title”, xlim = c(lower, upper), ylim = c(lower, upper)) #beside = FALSE for
stacked bar chart, #horiz = TRUE for horizontal bar chart
e. legend('topright', cex = 0.55, fill = col, legend = freq_tab$group)
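A worked sketch of a grouped bar chart on mtcars, assuming dplyr and tidyr are loaded (clusters are gear values, colours distinguish cylinder counts; all names are illustrative):
freq_tab <- mtcars %>% count(cyl, gear) %>% spread(key = gear, value = n)
freq_tab[is.na(freq_tab)] <- 0
bar <- as.matrix(freq_tab[, -1]) # drop the grouping column, keep the count columns
col <- rainbow(length(freq_tab$cyl))
barplot(bar, beside = TRUE, col = col, main = "Cars by gears and cylinders",
xlab = "Number of gears", ylim = c(0, 15))
legend('topright', cex = 0.75, fill = col, legend = freq_tab$cyl)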
Pie Chart:
1. piepercent <- 100*round(data/sum(data), 2) #calculate percentages, round is optional
2. slices <- data
3. label <- categorical_data
4. label <- paste(label, “,”, sep = ‘’) #must be in order
5. label <- paste(label, piepercent)
6. label <- paste(label, “%”, sep = ‘’)
7. pie(slices, labels = label, col = c(“col1”, “col2”, “col3”), radius = 1, main = “title”)
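A self-contained sketch with made-up counts and categories (all values are hypothetical):
slices <- c(40, 30, 20, 10) # hypothetical counts
label <- c("North", "South", "East", "West") # hypothetical categories
piepercent <- 100*round(slices/sum(slices), 2)
label <- paste(label, ",", sep = '')
label <- paste(label, piepercent)
label <- paste(label, "%", sep = '')
pie(slices, labels = label, col = c("red", "blue", "green", "orange"), radius = 1, main = "Share by region")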

Line Chart #e.g here is mean for all countries across a period of time
1. line <- describe(df) %>% select(mean) %>% filter(mean > 4 & mean < 90) #last part
is just to select rows containing means of years
2. plot(x = as.numeric(rownames(line)), y = line$mean, type = "o", col = "red",
xlab = "x-axis", ylab = "y-axis", main = "title")
Plots
1. Density plot
a. plot(density(data$var))
2. Normal Q-Q plot
a. qqnorm(data$var, main = “title”)
b. qqline(data$var, col = 2)

3. Linear Regression Plots #use to check assumptions of linear regression


a. plot(model, 1): Residuals vs Fitted plot
i. Linearity of data: check if residuals scatters around y = 0
ii. Homoscedasticity (constant error variance):
1. Fanning effect #means heteroskedasticity present
2. Funnel Effect #means heteroskedasticity present
iii. Independence of errors: check if data is scattered randomly
b. plot(model, 2): Q-Q plot
i. Normality of errors: check if shape of graph is linear
c. Scatter plot for qualitative variables regression #var1 is the categorical
variable, var2 is the non categorical variable
i. colours <- c(var1a = “red”, var1b = “black”, var1c = “green”) #sub
categories of categorical variable
ii. plot(df$var2, df$response, col = colours[df$var1], xlab = “x-axis”, ylab
= “y-axis”, main = “title”)
iii. legend(“topright”, legend = levels(df$var1), fill = colours)
iv. Plot line for subset 1a #repeat for all other subsets
1. df_var1a <- filter (df, var1 == "var1a")
2. fit_var1a <- lm(response ~ var2, data = df_var1a)
3. abline(fit_var1a, col = "red")
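A short sketch of the diagnostic plots in 3a and 3b, using a model fitted on mtcars (the predictors are illustrative):
model <- lm(mpg ~ wt + hp, data = mtcars)
par(mfrow = c(1, 2))
plot(model, 1) # Residuals vs Fitted: look for random scatter around y = 0 with no fanning/funnelling
plot(model, 2) # Normal Q-Q: points should lie close to the reference line
par(mfrow = c(1, 1))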
Outlier Analysis (do both)
1. Histogram
a. Right-skewed or Left-skewed: indicates outlier values (at the tail end)
2. Boxplot
a. Can see exactly how many outliers
Hypothesis Testing
One Sample Test
1. Proportion (z-statistic)
a. subset <- df %>% filter(condition)
b. prop.subset <- nrow(subset)/nrow(df)
c. prop_z.stat <- (prop.subset - predicted) / sqrt(predicted*(1-predicted)/nrow(df))
# z-statistic, where predicted is the hypothesised proportion
d. (critical value) qnorm(significance level) # left-tailed test OR qnorm(confidence
level) # right-tailed test OR qnorm(1 - significance level/2) # two-tailed test
e. Reject H0: z-stat < critical value # left-tail (qnorm(significance level) is negative)
OR z-stat > critical value # right-tail OR abs(z-stat) > critical value # two-tailed
(see the worked sketch after this list)
2. Known population sd (t-test)
3. Unknown population sd (t-test)
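A worked sketch of the one-sample proportion z-test in step 1, using mtcars and a hypothesised proportion of 0.5 (two-tailed, 5% significance; all numbers are illustrative):
library(dplyr)
subset_df <- mtcars %>% filter(am == 1) # manual-transmission cars
prop.subset <- nrow(subset_df)/nrow(mtcars)
predicted <- 0.5
prop_z.stat <- (prop.subset - predicted) / sqrt(predicted*(1-predicted)/nrow(mtcars))
crit <- qnorm(1 - 0.05/2) # two-tailed critical value
abs(prop_z.stat) > crit # TRUE means reject H0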

Two Sample Test # take note of the alphabetical order! Lower always on LHS
1. Paired t-test #used when means of 2 samples are naturally matched: e.g.
measurement taken at two different times
a. t.test(data$sample1, data$sample2, paired = TRUE)
2. Population variance unknown # alternative corresponds with H1, which is what you
want to check (e.g. check if µ1 more than µ2, then H0: µ1 - µ2 <= 0 and H1: µ1-µ2 >
0, and alternative = “greater”)
a. t.test(var1 ~ var2, alternative = “less/greater/two-sided”, data = data)
3. Population variance known
a. t.test(var1 ~ var2, alternative = “less/greater/two-sided”, data = data, var.equal
= TRUE)
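For example, comparing mean mpg across transmission types in mtcars (illustrative; am plays the role of var2):
t.test(mpg ~ am, alternative = "two.sided", data = mtcars) # variances not assumed equal (Welch)
t.test(mpg ~ am, alternative = "two.sided", data = mtcars, var.equal = TRUE) # pooled-variance version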

Confidence Interval #associated with sampling distribution of a statistic


1. For mean with unknown population sd
a. lCI <- mean(df1$var) + qt(0.025, df = nrow(df1)-1)*sd(df1$var)/sqrt(nrow(df1))
b. uCI <- mean(df1$var) - qt(0.025, df = nrow(df1)-1)*sd(df1$var)/sqrt(nrow(df1))
# qt(0.025, ...) is negative, so + gives the lower bound and - the upper bound;
# 0.025 depends on the confidence they ask for, in this case it's a 95% confidence
# interval since 0.05/2 = 0.025
c. print(cbind(lCI, uCI), digits = 3) # digits refers to sf
d. Interpretation: The x% confidence interval for the mean is [lCI, uCI]. This means
we can be x% confident that the population mean lies between lCI and uCI.
2. For mean with known population sd
a. lCI <- mean(df1$var) + qnorm(0.025)*sigma/sqrt(nrow(df1))
b. uCI <- mean(df1$var) - qnorm(0.025)*sigma/sqrt(nrow(df1)) # sigma is the known
population sd; 0.025 depends on the confidence they ask for, in this case it's a 95%
confidence interval since 0.05/2 = 0.025
c. print(cbind(lCI, uCI), digits = 3)
d. Interpretation: The x% confidence interval for the mean is [lCI, uCI]. This means
we can be x% confident that the population mean lies between lCI and uCI.
3. Proportion
a. df1 <- df %>% filter(condition)
b. phat <- nrow(df1)/nrow(df)
c. lCI <- phat + qnorm(0.025)*sqrt(phat*(1-phat)/nrow(df))
d. uCI <- phat - qnorm(0.025)*sqrt(phat*(1-phat)/nrow(df)) # qnorm(0.025) is negative;
0.025 depends on the confidence they ask for, in this case it's a 95% confidence
interval since 0.05/2 = 0.025
e. print(cbind(lCI, uCI), digits = 3)
f. Interpretation: we can be x% confident that the population proportion lies within
the x% confidence interval [lCI, uCI]
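A worked sketch of case 1 (mean, unknown population sd) on mtcars, at 95% confidence:
lCI <- mean(mtcars$mpg) + qt(0.025, df = nrow(mtcars)-1)*sd(mtcars$mpg)/sqrt(nrow(mtcars))
uCI <- mean(mtcars$mpg) - qt(0.025, df = nrow(mtcars)-1)*sd(mtcars$mpg)/sqrt(nrow(mtcars))
print(cbind(lCI, uCI), digits = 3) # qt(0.025, ...) is negative, so + gives the lower bound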

Prediction Interval #associated with the distribution of the random variable itself
1. check normality
a. qqnorm(df$var, ylab = "Sample Quantiles for var for (group)")
b. qqline(df$var, col = “red”)
c. shapiro.test(df$var)
2. Transform data if data is not normally distributed
a. df$var.t = transformTukey(df$var, plotit = TRUE)
b. mnvar.t <- mean(df$var.t)
c. sdvar.t <- sd(df$var.t)
d. lPI_var.t <- mnvar.t + (qt(0.025, df = nrow(df)-1))*sdvar.t*sqrt(1+1/nrow(df))
e. uPI_var.t <- mnvar.t - (qt(0.025, df = nrow(df)-1))*sdvar.t*sqrt(1+1/nrow(df))
# qt(0.025, ...) is negative; 0.025 depends on the confidence they ask for, in this
case it's a 95% prediction interval since 0.05/2 = 0.025
f. cbind(lPI_var.t, uPI_var.t)
3. Reverse transformation
a. lPI_var <- #equation depends on lambda value
b. uPI_var <- #equation depends on lambda value
c. print(cbind(lPI_var, uPI_var), digits = 3)
Linear Regression
For ALL regression models:
1. F-stat p-value < 0.05: can reject H0 and conclude that at least one of the predictor
coefficients is significantly different from 0
Simple Linear Regression
1. model <- lm(response ~ predictor, data = data)
2. summary(model)
3. Interpretation
a. ß0: average value of response when predictor is 0
b. ß1: for every 1 unit increase in predictor, ß1 is the change in the response,
holding all other variables constant
c. F-stat p-value < 0.05: relationship between response and predictor is
significant at 5% level of significance
d. Adjusted R-Squared:
i. 0 < R2 < 1: 1 indicates perfect fit, 0 indicates no relationship exists

Multivariate Model
1. model <- lm(response ~ predictor1 + predictor2, data = data)
2. summary(model)
3. Interpretation
a. ß0: average value of the response when all predictors are 0
b. ßi: for every 1 unit increase in predictor, ßi is the change in the response,
holding all other variables constant
c. F-stat p-value < 0.05: response has a linear relationship with at least one of
the predictors at 5% level of significance
d. Adjusted R-Squared:
i. 0 < R2 < 1: 1 indicates perfect fit, 0 indicates no relationship exists
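For example, on mtcars (the predictors are illustrative):
model <- lm(mpg ~ wt + hp, data = mtcars)
summary(model) # read off coefficient p-values, the F-statistic p-value and Adjusted R-squared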

Categorical Variable (Dummy variable)


1. Without interaction term
a. model <- lm(response ~ predictor1 + predictor2, data = data)
b. summary(model)
c. Interpretation
i. ß0: average value of the response when all predictors are 0
ii. ßi: for every 1 unit increase in predictor, ßi is the change in the
response, holding all other variables constant #non categorical
iii. ßj: the change in response relative to reference level when predictor is
xxx #categorical
iv. Adjusted R-Squared:
1. 0 < R2 < 1: 1 indicates perfect fit, 0 indicates no relationship
exists
2. With interaction term
a. model <- lm(response ~ predictor1 + predictor2 + predictor1*predictor2, data
= data)
b. summary(model)
c. Interpretation:
i. ßk: the slope (with interaction) for (categorical variable level) is more
negative/positive than the slopes for the other levels of the category. For
(categorical variable level), every additional unit of (interaction variable)
increases/decreases the average (response) by (ßj + ßk), where ßk is the
coefficient of the interaction term # whether it is an increase or a decrease
depends on the sign of (ßj + ßk)
3. Changing reference level
a. df$var <- factor(df$var)
b. df$var <- relevel(df$var, ref = “new ref”)

Logistic Regression # when the question has to do with PROBABILITY


1. model <- glm(response ~ predictor1 + predictor2 + predictor3, data, family =
“binomial”)
2. summary(model)
3. Interpretation
a. Model equation:
i. log-odds(response) = ß0 + ß1 predictor1 + ß2 predictor2
b. ß0: when all predictors = 0, the odds of the response (e.g. odds of default) = e^(ß0)
c. ßi: for every 1 unit increase in predictor, ßi is the change in the log-odds of
response, holding all other variables constant #non categorical
d. ßj: ßj is the change in the log-odds of the response as compared to the
reference level #categorical

4. Confusion Matrix
a. Create model
b. Predict probability using model #basically steps a and b in Predicting
probability using logistic regression model
c. confusionMatrix(factor(pred_response), factor(df$response), positive =
“1/Yes”) # after fitting model
d. Interpretation
i. Sensitivity: Proportion of predicted yes out of actual yes
1. TP/(TP + FN)
ii. Specificity: Proportion of predicted no out of actual no
1. TN/(FP + TN)
iii. Precision: Proportion of actual yes out of predicted yes
1. TP/(TP + FP)
iv. Accuracy: Proportion of correct classification
1. (TP + TN)/(TP + FN + FP + TN)
v. F1-Score: balanced index between precision and sensitivity
1. (2*Precision*Sensitivity)/(Precision + Sensitivity)
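A minimal end-to-end sketch, assuming the caret package is available and using mtcars$am as a 0/1 response (model and cutoff are illustrative):
library(caret)
model <- glm(am ~ wt + hp, data = mtcars, family = "binomial")
pred_prob <- predict(model, type = "response")
pred_response <- ifelse(pred_prob >= 0.5, 1, 0)
confusionMatrix(factor(pred_response), factor(mtcars$am), positive = "1")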
Predicting
1. Multivariate Model:
a. new <- data.frame(predictor1 = x, predictor2 = y)
b. predict(model, newdata = new)

2. Categorical Variable:
a. new <- data.frame(predictor1 = x, predictor2 = as.factor(1 or 0))
b. predict(model, newdata = new)
3. Logit model
a. new = data.frame(predictor1 = x, predictor2 = y) # if predictor is yes/ no, then
predictor = “yes/no”
b. predict(model, newdata = new, type = “response”)

ANOVA test for linear regression model #only for multivariate regression models
H0: ß1 = ß2 = ß3 = 0
H1: at least one ßi != 0
1. aov1 <- aov(model)
2. print(summary(aov1))
3. Interpretation
a. SSM: sum of `Sum Sq` for each predictor #sum of squares model
b. SST: sum of `Sum Sq` for each predictor + Residuals #total sum of squares
i. SST = SSM + SSR (residual sum of squares)
c. Coefficient of determination (R-squared) = SSM/SST
d. Convert R-squared to percentage
i. x% of the total variation in (response) is explained by predictor1,
predictor 2 #add on/ remove accordingly
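A short sketch of the SSM/SST arithmetic on an mtcars model (predictors are illustrative):
model <- lm(mpg ~ wt + hp, data = mtcars)
aov1 <- aov(model)
ss <- summary(aov1)[[1]][["Sum Sq"]] # Sum Sq for wt, hp and Residuals, in that order
SSM <- sum(ss[1:2])
SST <- sum(ss)
SSM/SST # coefficient of determination; matches the Multiple R-squared from summary(model)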

Scatter Plot Matrix


1. scatter_matrix <- pairs.panels(data, main = "Scatter Plot Matrix")
Data Mining
PCA
1. Conduct PCA
a. df1 <- df %>% select(var1, var2, var3)
b. pca <- prcomp(formula = ~ var1 + var2 + var3, data = df1, center = TRUE,
scale = TRUE)
c. pca$rotation #if you want to get linear combination of principal component
loadings
i. Coefficients for PCi are the numbers in the ith column
d. summary(pca) #use to determine the top k principal components
i. Determine the top k principal components by obtaining k number of
proportion of variation that sum up to at least 95%
2. Logistic regression classifier
a. df1 <- cbind(df1, pca$x[, col:col]) # to extract the principal component scores that
you want and add them to the data frame
b. model <- glm(response ~ PC1 + PC2, data = df, family = “binomial”)
c. summary(model)

3. Predict probability using logistic regression model


a. pred_prob <- predict(model, type = “response”)
b. pred_response <- ifelse(pred_prob >= x, 1, 0) # x should be given, “>=” will
depend on the question as well
c. sum(pred_response == 1)
d. sum(pred_response == 0)

4. Visualise PCA (PCA biplot)


a. biplot(pca) # gives the composition of the top 2 PCs with respect to all predictors
b. Interpretation:
i. If the majority of a variable's arrow length lies along the PC1 axis, that
variable contributes mainly to PC1, which captures the largest variation
ii. If the majority of a variable's arrow length lies along the PC2 axis, that
variable contributes mainly to PC2, which captures the next-largest variation
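A compact sketch of the whole PCA workflow on a few numeric mtcars columns (the variable choice is illustrative; assumes dplyr is loaded):
library(dplyr)
df1 <- mtcars %>% select(mpg, disp, hp, wt)
pca <- prcomp(formula = ~ mpg + disp + hp + wt, data = df1, center = TRUE, scale = TRUE)
pca$rotation # loadings: coefficients of each principal component
summary(pca) # cumulative proportion of variance: pick the top k PCs covering at least 95%
biplot(pca) # composition of the top 2 PCs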

Stepwise model selection


1. Backward
a. model <- lm(response ~ predictor1 + predictor2 + predictor3, data = df)
b. bmodel <- step(model, direction = “backward”)
c. summary(bmodel)
2. Forward
a. model_intercept <- lm(response ~ 1, data = df)
b. fmodel <- step(model_intercept, scope = ~ predictor1 + predictor2 +
predictor3, direction = “forward”)
c. summary(fmodel)
3. Interpretation
a. Compare final selected models: if they are exactly the same, the two models
agree
b. Compare adjusted R-squared value between forward and backward stepwise
selection model
i. Same: the two models are the same in terms of goodness of fit
ii. Different: the model with the higher value is better in terms of
goodness of fit
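For example, on mtcars (the candidate predictors are illustrative):
model <- lm(mpg ~ wt + hp + disp, data = mtcars)
bmodel <- step(model, direction = "backward")
model_intercept <- lm(mpg ~ 1, data = mtcars)
fmodel <- step(model_intercept, scope = ~ wt + hp + disp, direction = "forward")
summary(bmodel)
summary(fmodel) # compare the selected predictors and the adjusted R-squared values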

Linear Optimisation
Table #write out what each decision variable is before doing the table
Maximise total profit using decision variables $X_1$, $X_2$ | Profit = 0.15$X_1$ + 0.40$X_2$
--- | ---
Subject to |
Budget Constraint | 0.20$X_1$ + 0.70$X_2$ $\leq$ 100
Space Constraint | $X_1$ + $X_2$ $\leq$ 200
Non-Negativity Constraint 1 | $X_1$ $\geq$ 0
Non-Negativity Constraint 2 | $X_2$ $\geq$ 0

Optimal solution
1. objective.fn <- c(a, b)
2. const.mat <- matrix(c(b, c, d,
s, t, u), ncol = n, byrow = TRUE) # n = no. of decision variables
3. const.dir <- c("<=", ">=") # should correspond to the no. of constraint rows
4. const.rhs <- c(p, q)
5. lp.soln <- lp("max", objective.fn, const.mat, const.dir, const.rhs,
compute.sens = TRUE) # use "min" for a minimisation problem
6. lp.soln$solution # gives the optimal values of the decision variables
7. lp.soln # gives the optimal value of the objective function
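A worked sketch for the profit problem tabulated above, assuming the lpSolve package (non-negativity is already implied by lp(), so only the budget and space rows are needed):
library(lpSolve)
objective.fn <- c(0.15, 0.40)
const.mat <- matrix(c(0.20, 0.70,
1, 1), ncol = 2, byrow = TRUE)
const.dir <- c("<=", "<=")
const.rhs <- c(100, 200)
lp.soln <- lp("max", objective.fn, const.mat, const.dir, const.rhs, compute.sens = TRUE)
lp.soln$solution # optimal X1 and X2
lp.soln # optimal total profit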

Range of objective function coefficient/ Sensitivity analysis # use whenever they ask to
adjust the coefficient of objective function
1. range_objcoef = cbind(lp.soln$sens.coef.from, lp.soln$sens.coef.to)
2. rownames(range_objcoef) = c(‘coef1’, ‘coef2’, ‘coef3’)
3. colnames(range_objcoef) = c(‘from’, ‘to’)
4. print(range_objcoef)
5. Interpretation (Question: What is the maximum/minimum value where the current
solution remains optimal)
a. If the (value) is strictly more than/ less than (max/min), current solution is no
longer optimal
b. Contextualised e.g: When risk measure is strictly more than 7.221150, it
becomes optimal for the investment company to cease investing in mortgage
funds, holding all other variables constant (more general context: should be
strictly lesser than lower bound or strictly more than upper bound).
Shadow prices
1. lp.soln$duals # the constraint duals (shadow prices) come first, followed by the variable duals
2. Interpretation
a. If shadow price = 0: non binding
b. If shadow price > 0: binding
i. Increasing rhs of constraint by 1 unit will lead to an increase in
(objective function) by (shadow price)
c. If shadow price < 0: binding
i. Decreasing rhs of constraint by 1 unit will lead to a decrease in
(objective function) by (shadow price)

Reduced cost
1. Definition: How much the objective coefficient needs to be reduced for a
non-negative variable that is zero in the optimal solution to become positive.
2. lp.soln$duals # the variable duals (reduced costs) follow the constraint duals, in the
order of the decision variables
3. Interpretation
a. Reduced cost = (objective coefficient) - (dual variable)
b. The objective coefficient must be reduced by more than the (dual variable), i.e. it
must move below/above the reduced cost, before the variable becomes positive in
the optimal solution, holding all other variables constant # reduced cost positive:
below; reduced cost negative: above
Integer Optimisation
When X is binary
Defining Constraints
1. X1 is a prerequisite for X2
a. X1 - X2 >= 0
2. X2 is a prerequisite for X3
a. X1 - X2 >= 0 AND # x1 is a prerequisite for x2
b. X2 - X3 >= 0
c. Overall: X1 - X3 >= 0 # add the two equations together
3. Prerequisite for Y3 is EITHER Y1 OR Y2
a. Y1 - Y3 >= 0 OR
b. Y2 - Y3 >= 0
c. Overall: Y1 + Y2 - Y3 >= 0
4. Prerequisite for Y3 is Y1 AND Y2
a. Y1 - Y3 >= 0 AND
b. Y2 - Y3 >= 0
c. Overall: Y1 + Y2 - 2Y3 >= 0

Optimal Solution
1. objective.fn <- c(a, b)
2. const.mat <- matrix(c(b, c, d,
s, t, u), ncol = n, byrow = TRUE) # n = no. of decision variables
3. const.dir <- c("<=", ">=") # should correspond to the no. of constraint rows
4. const.rhs <- c(p, q)
5. lp.soln <- lp("max", objective.fn, const.mat, const.dir, const.rhs, int.vec = c(1:n))
# use "min" for a minimisation problem; n = no. of decision variables
6. lp.soln$solution # gives the optimal values of the decision variables
7. lp.soln # gives the optimal value of the objective function
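A binary-variable sketch with made-up numbers, assuming the lpSolve package: pick projects to maximise value under a budget, with X1 a prerequisite for X2 (the <= 1 rows keep the integer variables binary):
library(lpSolve)
objective.fn <- c(10, 8, 6) # hypothetical project values
const.mat <- matrix(c(5, 4, 3, # budget used by each project
1, -1, 0, # X1 - X2 >= 0 (X1 is a prerequisite for X2)
1, 0, 0, # X1 <= 1
0, 1, 0, # X2 <= 1
0, 0, 1), # X3 <= 1
ncol = 3, byrow = TRUE)
const.dir <- c("<=", ">=", "<=", "<=", "<=")
const.rhs <- c(9, 0, 1, 1, 1)
lp.soln <- lp("max", objective.fn, const.mat, const.dir, const.rhs, int.vec = c(1:3))
lp.soln$solution # which projects to pick
lp.soln # optimal total value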

Others
Creating a new variable in a data frame based on another variable
1. CS$firm <- NA
2. CS$firm[CS$indus == 1] <- "1"
3. CS$firm[CS$finance == 1] <- "2"
4. CS$firm[CS$consprod == 1] <- "3"
5. CS$firm[CS$utility== 1] <- "4"

To get table of mean, standard deviation, standard error, median, skew, kurtosis etc
1. tab <- describe(data$var)
2. tab$trimmed <- tab$mad <- tab$min <- etc <- NULL # to remove any unwanted
columns from describe OR select(mean) #to select a specific column
3. kable(tab, row.names = FALSE, caption = “title”)
Correlation test (significance of correlation)
1. cor.test(df$var1, df$var2) # to determine the significance of the correlation between 2
variables (for more than 2 variables, use corr.test in step 3)
a. Interpret p-value to determine significance
2. cor(df$var1, df$var2) #to determine correlation coefficient between 2 variables
3. df %>% select(var1, var2, var3) %>% corr.test #determine correlation value for more
than 2 variables
Coefficient of Skewness
1. skew(var) #peak on left side: positively skewed (+ve CS), higher magnitude = more
skewed

Kurtosis
1. kurtosi(var) #cut off in R is 0 instead of 3. CK < 0 indicates flat with wide range of
dispersion

Pareto analysis
1. df.var <- df %>% select(var) %>% arrange(desc(var)) %>% filter(var>0) # filter step is
to remove na values, so condition will depend on the question
2. df.var$Percentage <- df.var$var/ sum(df.var$var) #compute var over sum of var
3. df.var$Cumulative <- cumsum(df.var$Percentage) #compute cumulative % of var
4. var80 <- which(df.var$Cumulative > 0.8)[1] #compute number with top 80% var.
Note: 0.8 depends on the split that the question asks for, in this case its an 80-20
split
5. var80/ nrow(df.var) #compute % with top 80% var

Compute probability
1. pnorm(800, mean = m, sd = s, lower.tail = FALSE) #Probability that demand > 800,
MUST BE NORMALLY DISTRIBUTED
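For instance, with a hypothetical demand that is normally distributed with mean 750 and sd 50:
pnorm(800, mean = 750, sd = 50, lower.tail = FALSE) # P(demand > 800), roughly 0.159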
