Download as docx, pdf, or txt
Download as docx, pdf, or txt
You are on page 1of 10

CLEAR CONSOLE Cntrl+L

Run code in editor Ctrl+Enter


New line in editor Enter
BASIC CODE
IMPORT CSV FILES Table_name<-read.csv(file.choose())
atm<-read.csv(file.choose())
Invoke (extract) a column separately one_year_return<-retire$X1YrReturn.
var_name<-tn$cln_name_in_table
SUBSETTING create table with only one type of value Growth<-subset(retire,retire$Type=="Growth")
like growth
Finding unique value in any column unique(retire$Type)
First few rows head(atm) #n rows
List of all colums names colnames(atm)
names(tn)
##check the structure of the data frame str(tn)
Add new cln New_tn<-cbind(old_tn,variable_name)
new_brandz<-cbind(brandz,clean_brand_value)
atm2$URBAN<-NULL
atm2$X<-NULL
atm$new<-c(1:140)
Delete rows atm1<-atm[-c(1,2), ] #row 1 and 2
atm2<-atm[ ,-c(1,2)] # column 1 and 2
# to delete a column, assign NULL to it
Number of elements in column N<-length(value_nm)
Types of variable
#numerical- numeric, integer, complex is.numeric(b)
# text- character, factor is.character(gender)
#boolean/logical-True, False is.integer(b)
a<-"monday" is.logical(b)
b<-47 is.factor(b)
nominal cardinal integer scale ratio scale is.complex(b)
CHANGE TYPE OF VARIABLE gender<-c("male","female")
##coerce as a factor genderfactor<-as.factor(gender)
##coerce as numeric gender_numeric<-as.numeric(genderfactor)
##type of data class(rbi_atm_data)
# remove the NAs##type of data total_urban<-sum(rbi_atm_data$URBAN,na.rm=T)
STATS FORMULAE
sqrt(value)
MEAN M<-mean(value_nm)
basic_model<-cardio[c(1:80),]
Mean of particular section of data mean(basic_model$Age)
OR
mean(cardio$Age[cardio$Product == "TM195"])
STANDARD DEVIATION S<-sd(value_nm)
Z a/2 # default Q<-qnorm(p,mean=0,sd=1,lower.tail=TRUE)
Ta/2 T<-qt(p,df,lower.tail=TRUE)
T.TEST #default conf.level = .95 t.test(Growth_1yr$X1YrReturn.,conf.level = .95)$conf.int
PACKAGES
###install the package install.packages("prob")
##calling the package for action library(prob)
Toss coin sample space tosscoin(3)
Sample space with probability tosscoin(3,T)
tosscoin()
##rolldie(times, nsides,makespace) rolldie(2)
##makespace=true means probabilities of outcomes rolldie(2,nsides=8,makespace = T)
will also be reported
# false is default
###urnsamples default is replace=false and urnsamples(c(1:9),2,replace = F,ordered = F)
ordered=false urnsamples(c(1:9),2, replace = T)
##equivalent to 9 Choice 2 urnsamples(c(1:9),2, replace = T, ordered=T)
urnsamples(c("H","T"),2,replace = T,ordered=T)
abc<-rep(c("red","green","yellow"), times=c(3,2,4))
urnsamples(abc,2)
urnsamples(rolldie(3),1)
urnsamples(rolldie(3),2)
ABC<-rep(c("red","blue","green"),times=c(2,4,3))
urnsamples(ABC,2,replace = T,ordered = T)
urnsamples(ABC,2,replace = T,ordered = F)
CLEAN DATA
convert cln type to numeric New_cln<-as.numeric(tn$old_cln)
Check for na is.na(clean_brand_value)
No of na sum(is.na(clean_brand_value))
Remove na clean_brand_without_nas<-na.omit(clean_brand_value)
CORRECT TYPOS df1<-c("male","mail","femail")
df2<-rep(c("male","femail","mail","female"),c(1:4))
VISUALIZATION
To check margin of plot output window par("mar")
#default margin is usually 5.1,4.1,4.1,2.1 for BLTR par(mar=c(1,1,1,1)) # to change mar
To draw histogram hist(OYT) #automatically takes equal bins sizes
hist(OYT, breaks=6) # to change no of equal bins
breaks defind as break point h<-hist(OYT, breaks= c(-25,-15,-5,5,15,25))
#text(x,y,labels= ,adj= )adj- adjust text position, # object h helps as printing h gives details of hist
#adj takes values 0 to 1 text(h$mids,h$counts,labels=h$counts, adj=c(0, 1))
UNEQUAL BREAKS hist(OYT, breaks= c(-20,-9,2,24)) # will take densities
hist(OYT, breaks=-25:25) # sequence of integers
## again does not always work exactly
##k = [log2(n) + 1], Sturges formula (default setting)
hist(OYT, breaks= c(-20,-9,-1.5,2,3,4,5,6,7,15,24), xlim=c(-25,25), ylim=c(0,0.1))
### reporting relative densities
##freq=False, default is True for equal classes
hist(OYT,main="Distribution of 1 yr returns" ,xlab="r",
ylab="R", freq=F)
###to add a line/curve for density
##add the following after the histogram command

lines(density(OYT), lty = 1)
##lty means “line style” 0 means no line, 1 means solid,
lines(density(OYT), lty = 1)
###to get just density of a distribution
d<-density(OYT)
plot(d)
#text(x,y,labels= ,adj= )adj- adjust the text position,
text(h$mids,h$counts,labels=h$counts, adj=c(0.5, -0.5))
BOX PLOT with whiskers going to the entire range of the distribution (range=0)
boxplot(OYT, horizontal = T, xlab="1 yr returns", col="green", range=0)
boxplot(x,y,z ,names=c("x","y","z"), xlab="r ", ylab="a ",
ylim=c(-15,35), col=c("magenta","green","yellow"),
range=1.5)

boxplot(x, y ,z ,xlab="a",main="TITLE",
col=c("A","B","C"), names=c("8","9","2"),
ylab="D",ylim=c(0,10))

text(y=c(fivenum(X),fivenum(Y),fivenum(Z)),labels=c(fivenum(X),fivenum(Y),fivenum(Z)),x=c(.4,1.5,2.5),cex=0.5)
DESCRIPTIVE STATISTICS
OYT<-retire$X1YrReturn.
Mean MeanOYT<-mean(OYT)
Median medianOYT<-median(OYT)
Trim mean trimOYT10<-mean(OYT,trim = 0.10)
SUMMARY # min, 1st quartile, median, mean, 3rd quartile, max summary(OYT)
install.packages("moments") skewness(OYT)
library(moments) # TO IMPORT LIBRARY kurtosis(OYT)
VARIANCE var(OYT)
STANDARD DEVIATION sd(OYT)

SAMPLING
sample of size 50 from normal distbn mean 100 sd 15 x <- rnorm(50, mean=100,sd=15)
many samples of equal size from normal distribution replicate(1000,rnorm(50))
1000 samples of size 50 each t(replicate(1000,rnorm(50)))
##transpose for better view of 50 samples
sample from Poisson y<-rpois(50,lambda=1.2)
sample from binomial s3<-rbinom(n=6,size=5,0.62)
n=6, size= 5 trials experiment, p=(prob success
sampling from hypergeometric, rhyper(nn=6,m=3,n=4,k=2)
Sampling from uniform distribution runif(10,3,8)
Sampling from exponential distribution rexp()
sampling distribution of sample mean means1<-replicate(1000,mean(rnorm(50)))
# each sample has a mean, hist(means1)
#1000 means stored in the object "means1"
##taking many samples of equal size replicate(1000,rexp(50))
#from exponential distribution
##1000 samples of size 50 each
###sampling distribution of sample mean means_exp<-replicate(1000,mean(rexp(50,rate=3)))
# each sample has a mean, hist(means_exp)
#1000 means stored in the object "means_exp" mean(means_exp)
sampling distribution of sample median medians_norm<-replicate(1000,median(rnorm(50)))
med<-density(medians_norm)
plot(med)
sampling distribution of sample variance variances <- replicate(10000, var(rnorm(50)))
Check E(s^2)= sigma^2 d<-density(variances)
mean(variances) plot(d)
S<-c(1:5)
means_unif<-
replicate(100,mean(sample(S,3,replace=TRUE)))
j<-density(means_unif)
plot(j)
CONFIDENCE LEVEL AND HYPOTESIS TESTING
confidence interval population mean, population L=qnorm(.025,lower.tail = T)
distribution and sigma unknown, m=sample size large, U= qnorm(.025,lower.tail=F)
#using qnorm(p,lower.tail=T or F)#True is the default W=U-L
CI<-c(L,U)
population distribution normal, sigma unknown, sample L=qt(p=.025,df=n-1, lower.tail =T )
size does not matter, no need of CLT U=qt(p=.025,df=n-1, lower.tail =F )
(x-bar - mu)/(s/sqrt(n)) is EXACTLY t distributed CI2<-c(lower1yrwith_t,upper1yrwith_t)
##using qt(p=,df=,lower.tail=)
population distribution normal, sigma unknown, sample t.test(Growth_1yr$X1YrReturn.)$conf.int
size does not matter
#### another way of getting confidence interval when using t.test (95% CI default)
t.test(Growth_1yr$X1YrReturn.,conf.level = .99)$conf.int
CI for population proportion(pi) where sample n=sample size
proportion x=no of success
##is p(Levine et al) or p hat(Devore) p=x/n
##using prop.test(x,n,...)default conf level=95% prop.test(x,n)$conf.int
prop.test(x,n, conf.level =.90 )$conf.int
EXCEL SHORTCUT

Close excel Alt+F4


Close sheet Control + W
Open new excel book Control + N
ZOOM Alt+W+Q
Quick access toolbar Alt+ Up arrow
To move between all bars F6
Status bar Insert + End for NVDA, Insert + Page down for JAWS
To move to any cell through go to Control + G
To move to last row or column Control + Up arrow / right arrow
To move between columns Right arrow or tab
To move btw rows Down arrow or enter move in same cell if column navigated
by arrow ti col when tab started if tab used to move btw
columns
Edit cell F2
To select entire column Control + Space / Control + Shift + down arrow
To select entire row Shift + Space / Control + Shift + down arrow
Select specific rows and column Control + G and type A1:d10
To inc row height Control + Enter or through applicationkey
For line break in same cell Alt + Enter
Insert column/row in left hand side row Control+ (Shift + = ) / numpad +/ Alt + I + C
above
Delete cell row column Control (-)/ numpad - / Alt + I +R
Column header Insert+ control+ alt+ C/ insert+ shift +c nvda
Row header Insert+ control+ alt+ R/ insert+ shift +R nvda
Insert sheet Shift + F11
Delete sheet Alt + H +D + X
Move between sheets Control + Page down / Control + Page up
Height Width Hide Unhide rename sheet Alt + H + O
For sheet list Control + Shift + S for JAWS
For zoom excel Control + scroll mouse / Alt + B + Z
To copy formulae to all cells Select cells, Control + D for down, Control + R for right
Cell format Control + 1
Series Dialogue Box Alt+ H + F + I + S
Sort data Alt+H+S
Insert chart Alt + H + C
Control + shift + O JAWS Insert + Page down for status in JAWS, Insert + End for NVDA
C for clear filter
FORMULAE
SUM #kaccess key ALT+ = =SUM(Range)
SUMIF- in condition if character use " " for =SUMIF(criteria-range,condition,sum-range)
number no " " and " " if > =SUMIF(B2:F2,">100",B2:F2)
Use & for < cell and < function =SUMIF(B2:F2,">"&B2,B2:F2)
Sumif(range_criteria,criteria,sum_range) =SUMIF(B2:F2,">"&MIN(B2:F2),B2:F2)
Countif =COUNT(range)
Criteria is not case sensitive =COUNTA(range)
=COUNTIF(range,">"&A1)
=COUNTIF(range,"male")
IF =if(logical,value_if_TRUE,value_if_FALSE)
=IF(B4>30,"m","y")
If Loop (nested IF) =IF(A4="T","B",IF(A4="M","M",IF(A4="8","T","False")))
If with and and or =IF(AND(C4="Male",E4="Single"),"Male
Single",IF(AND(C4="Male",E4="Partnered"),"Male
Partnered",IF(AND(C4="Female",E4="Single"),"Female
Single","Female Partnered")))
And AND(C4="Male",E4="Single")
OR OR(C4="Male",E4="Single")
CLEANING AND ORGANIZING DATA
TYPE OF VARIABLE =ISNUMBER(B4)
Blank identify =ISBLANK(D2)
VLOOKUP =VLOOKUP(A18,$A$2:$J$12,7,0)
=VLOOKUP(<LOOKUP CELL>,<DATA RANGE>,<INDEX OF COLUMN TO EXTRACT>,0)
TO CORRECT TYPO ERROR ALL ENTIRES IN =VLOOKUP(D2,$J$2:$K$6,2,FALSE)
1ST COLUMN OF REFERENCE ALL CORRECT
SPELLING IN 2 COLUMN OF REFERENCE ,
REFERENCE CREATE TABLE

FREQUENCY DISTRIBUTION In second column select all cell where frequency displayed
Right bin sizes in first column type =FREQUENCY(range of original data,range of cell of
After formulae Press Cntrl +Shift +Enter bin value)
PIVOT TABLE Insert pivot table
Right click cell and change show value as total
Show value as column total or row total means percentage
distribution in no of male, female income
Double click on any table to see all data
Insert and slicers right click and remove slicers
DESCRIPTIVE STATISTICS
Average =AVERAGE(C24:C63)
Data>data_analysis>descriptive statistics> =TRIMMEAN() # round mean removing top 10% value
summary statistics =GEOMEAN() # growth rate of mean, time series data
MODE =MODE.MULT()
=MODE.SNGL()
=MODE(range)
Min max =min(range)
=max(range)
Median =quartile(range,2)
=median(range)
Quartile =quartile(range,1)
Standard deviation =standardize() #for standardizing values
=stdev.s(range)
=stdev.p(range)
VARIENCE =VAR.S(range)
=VAR.P(range)
Kurtosis =kurt(range)
SKEWNESS =SKEW(range)
RANDOM SAMPLING =RAND() #A random value btw 0 and 1
Gives a random value =randbetween(bottom,top) #rand btw top bottom
Empirical rule for bell shaped frequency 68% data at mean-1*sd mean+1*stdev distance
curve 95% data at mean-2*sd mean +2*stdev distance
Chebyshev's theorem or inequality: Proportion $\geq 1 - \dfrac{1}{k^2}$ of the data lies between k sd distance from both sides of mean, for $k > 1$ (e.g. $1 < k < 4$)
PROBABILITY DISTRIBUTION
BINOMIAL =BINOM.DIST(x, n, p, cumulative) #true cumulative
=BINOM.INV(n,p,alpha)#return x value for alpha prob
#low_range and up_range both included =BINOM.DIST.RANGE(n, p, low_range,up_range)
POISSON =POISSON.DIST(x, mean, cumulative) #true cumulative
BERNOULLI =
NORMAL.S #standard normal mean=0, sd=1 =NORM.S.DIST(z, cumulative) #true cumulative false height
=NORM.S.INV(alpha)
=NORM.DIST(x, mean, sd, cumulative)
=NORM.INV(alpha, mean, sd)
T test =T.DIST(x, degree_freedom, cumulative)
=T.INV(p, degree_freedom)
SAMPLING
SAMPLING Alt +A+ Y+ 2 data >data analysis> random number generation
Number of variables -no of different sample number of random number – sample size
CENTRAL LIMIT THEOREM Sample mean of n samples is approximately normally distributed (for large n)
Mean of all sample means= mean of population
Variance of all sample means=population variance /n
Mean of all sample variances= population variance
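CONFIDENCE INTERVAL critical value $z_{\alpha/2}$, where $\alpha = 1 - \text{confidence level}$
$U = \bar{x} + \dfrac{z_{\alpha/2}\,\sigma}{\sqrt{n}}$, $L = \bar{x} - \dfrac{z_{\alpha/2}\,\sigma}{\sqrt{n}}$, $\text{RANGE} = \dfrac{2\,z_{\alpha/2}\,\sigma}{\sqrt{n}}$
when α increases width decreases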


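Confidence interval population sd unknown
Degree of freedom=n-1, Standard error= $\dfrac{s}{\sqrt{n}}$
$U = \bar{x} + \dfrac{t_{\alpha/2}\,s}{\sqrt{n}}$, $L = \bar{x} - \dfrac{t_{\alpha/2}\,s}{\sqrt{n}}$, $\text{RANGE} = \dfrac{2\,t_{\alpha/2}\,s}{\sqrt{n}}$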

SAMPLE SIZE determination, sample error=e
σ known: $n = \left(\dfrac{z_{\alpha/2}\,\sigma}{e}\right)^{2}$
σ unknown, estimated by range/6=r: $n = \left(\dfrac{z_{\alpha/2}\,r}{e}\right)^{2}$
For a sample proportion: $n = \left(\dfrac{z_{\alpha/2}\cdot 1}{e}\right)^{2} p(1-p)$
test for normal distribution (if 6 or 7 are satisfied it is normally distributed)
1. Histogram looks bell shaped
2. Box plot looks symmetrical
3. $[\bar{x} - s,\ \bar{x} + s]$ contains $\tfrac{2}{3}$rd of observations
4. $[\bar{x} - 2s,\ \bar{x} + 2s]$ contains $\tfrac{4}{5}$th of observations
5. $[\bar{x} - 3s,\ \bar{x} + 3s]$ contains $\tfrac{19}{20}$th of observations
6. IQR=1.33*s
7. RANGE=6*s
8. SKEWNESS is about 0
9. KURTOSIS excess= KURTOSIS-3 is about 0
HYPOTHESIS TESTING Z TEST WHEN N LARGE SIGMA UNKONWN
HYPOTHESIS TESTING n large σ unknown
TWO TAIL TEST-CRITICAL VALUE APPROACH
z test statistic $= \dfrac{\bar{x} - \mu_0}{e}$ where e is the standard error $s/\sqrt{n}$, $L = -z_{\alpha/2}$, $U = z_{\alpha/2}$
null sample mean= population mean
alternative= not equal to population mean
if z test statistic is between [L,U] don't reject null, if not reject null
HYPOTHESIS TESTING n large σ unknown
RIGHT TAIL -CRITICAL VALUE APPROACH
z test statistic $= \dfrac{\bar{x} - \mu_0}{e}$ where e is the standard error $s/\sqrt{n}$, $U = z_{\alpha}$
null sample mean= population mean
alternative= greater than population mean
if z test statistic is <U don't reject null, if not reject null
HYPOTHESIS TESTING n large σ unknown
LEFT TAIL TEST-CRITICAL VALUE APPROACH
z test statistic $= \dfrac{\bar{x} - \mu_0}{e}$ where e is the standard error $s/\sqrt{n}$, $L = -z_{\alpha}$
null sample mean= population mean
alternative= Less than population mean
if z test statistic is >L don't reject null, if not reject null
HYPOTHESIS TESTING n large σ unknown p=2*(1-NORM.S.DIST(ABS(Z TEST STATISTIC),TRUE))
TWO TAIL TEST-P VALUE APPROACH If p> α don’t reject else reject
null sample mean= population mean
alternative= not equal to population mean
HYPOTHESIS TESTING n large σ unknown p=1-NORM.S.DIST(Z TEST STATISTIC,TRUE)
RIGHT TAIL TEST-P VALUE APPROACH If p> α don't reject else reject
null sample mean= population mean
alternative= greater than population mean
HYPOTHESIS TESTING n large σ unknown p=NORM.S.DIST(Z TEST STATISTIC,TRUE)
LEFT TAIL TEST-P VALUE APPROACH If p> α don't reject else reject
null sample mean= population mean
alternative= Less than population mean
HYPOTHESIS TESTING n large σ unknown
TWO TAIL TEST-CL APPROACH $U = \bar{x} + \dfrac{z_{\alpha/2}\,\sigma}{\sqrt{n}}$, $L = \bar{x} - \dfrac{z_{\alpha/2}\,\sigma}{\sqrt{n}}$ (use s in place of σ when σ is unknown)
null sample mean= population mean
alternative= not equal to population mean
If population mean between [L,U] don't reject else reject
HYPOTHESIS TESTING - T TEST SAMPLE SIZE SMALL SIGMA UNKNOWN
HYPOTHESIS TESTING n small σ unknown
TWO TAIL TEST-CRITICAL VALUE APPROACH
T test statistic $= \dfrac{\bar{x} - \mu_0}{e}$ where e is the standard error $s/\sqrt{n}$, $L = -t_{\alpha/2,\,n-1}$, $U = t_{\alpha/2,\,n-1}$
null sample mean= population mean
alternative= not equal to population mean
if t test statistic is between [L,U] don't reject null, if not reject null
HYPOTHESIS TESTING n small σ unknown
RIGHT TAIL -CRITICAL VALUE APPROACH
T test statistic $= \dfrac{\bar{x} - \mu_0}{e}$ where e is the standard error $s/\sqrt{n}$, $U = t_{\alpha,\,n-1}$
null sample mean= population mean
alternative= greater than population mean
if t test statistic is <U don't reject null, if not reject null
HYPOTHESIS TESTING n small σ unknown
LEFT TAIL TEST-CRITICAL VALUE APPROACH
T test statistic $= \dfrac{\bar{x} - \mu_0}{e}$ where e is the standard error $s/\sqrt{n}$, $L = -t_{\alpha,\,n-1}$
null sample mean= population mean
alternative= Less than population mean
if t test statistic is >L don't reject null, if not reject null
HYPOTHESIS TESTING n small σ unknown p=T.DIST.2T(ABS(T TEST STATISTIC),degree_freedom)
TWO TAIL TEST-P VALUE APPROACH If p> α don’t reject else reject
null sample mean= population mean
alternative= not equal to population mean
HYPOTHESIS TESTING n small σ unknown p=T.DIST.RT(T TEST STATISTIC,degree_freedom)
RIGHT TAIL TEST-P VALUE APPROACH If p> α don't reject else reject
null sample mean= population mean
alternative= greater than population mean
HYPOTHESIS TESTING n small σ unknown p=T.DIST(T TEST STATISTIC,degree_freedom,TRUE)
LEFT TAIL TEST-P VALUE APPROACH If p> α don't reject else reject
null sample mean= population mean
alternative= Less than population mean
HYPOTHESIS TESTING n small σ unknown
TWO TAIL TEST-CL APPROACH $U = \bar{x} + \dfrac{t_{\alpha/2,\,n-1}\,s}{\sqrt{n}}$, $L = \bar{x} - \dfrac{t_{\alpha/2,\,n-1}\,s}{\sqrt{n}}$
null sample mean= population mean
alternative= not equal to population mean
If population mean between [L,U] don't reject else reject
Keyboard shortcut
Alt A + V + V Data validation dialogue box
Alt + A + N =SUM(Sheet1:Sheet2!A2) Data consolidation
Alt +A + E Text to column visit for

Cursor review Insert Page up


Screen copy Insert F9 for start mark Insert F10+ F10 end mark F10 twice
for copy
Navigation commands + insert in this mode Insert + Shift + up arrow read selected text
Insert + C to read copied text
Object review
Practical 4 5 7 8 9 11(excel part) 12(finishing pending work) 14(mock prac) 15 17 18 19 20 21 23 24 25
Name of the Course: B.A(Hons.) CBCS(LOCF)
Subject: Economics
Semester :III
Name of the paper: Data Analysis(SEC)
Unique Paper Code: 12273303

Time Allowed : 90 minutes Max. Marks: 10

Very Important: Please write your roll number  and the details mentioned above(Name of the course etc.) in the
first set of rows of  your worksheet and name your worksheet as your roll number. 

The data in the attached EXCEL sheet contains information related to cars that were part of an inventory of a used
car dealership on 31/03/2020. The variables included are year(of  manufacturing), price ($), mileage(the number of
miles the car has travelled) and fuel economy(mpg: miles per gallon).
1. Add “Age”  (in  years in 2020)  in a new column.
2. Visualise price and mileage on an appropriate graph with proper labels and titles . What is your inference
from this graph?
3. Find the correlation between price and mileage.
4. Put an appropriate formula/function to get the price of a car in the cell I3 for any car number that is typed
in H3 i.e. if we type a car number(say Car-23) in H3 we should get it’s price in I3
5. Import this data in R and use R commands to find the coefficient of skewness for the distribution of the
variable “Price”
6. Compare the distributions of prices 2000 made cars with 2014 made cars using boxplots. Label your charts
properly. 

Write the functions used in Excel and the commands used in R in your answer scripts.

1. Identify blanks and remove entire row with blanks

2. Check for typos in "Region" and "Product sector" using Vlookup


3. Check if all values of brand value are numbers and positive
4. Find the average Brand value change. Do you observe a problem? Try to fix it.
5. What is the average brand value of technology firms and average of financial firms

You might also like