You are on page 1of 21

PSA_ASSESMENT

VARUN MATHUR, 28954114

16 October 2018

——————————————————————————————–
Please Install the following packages when running on your PC. Please uncomment the
following statements while running

install.packages("sqldf")

install.packages("ggplot2")

install.packages("dplyr")

install.packages("plotrix")
——————————————————————————————–
#Loading the packages

#NOTE: Please uncomment when loading the packages.

library("dplyr")

## Warning: package 'dplyr' was built under R version 3.4.4

##
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':


##
## filter, lag

## The following objects are masked from 'package:base':


##
## intersect, setdiff, setequal, union

library("sqldf")

## Warning: package 'sqldf' was built under R version 3.4.4


## Loading required package: gsubfn

## Warning: package 'gsubfn' was built under R version 3.4.4

## Loading required package: proto

## Loading required package: RSQLite

## Warning: package 'RSQLite' was built under R version 3.4.4

library("ggplot2")

## Warning: package 'ggplot2' was built under R version 3.4.4

library("plotrix")

## Warning: package 'plotrix' was built under R version 3.4.4

##NOTE: Kindly note, I have read the file as .csv and NOT .xlsx

# Load the comma-separated file (first row holds the column names) into the
# data frame "MyData"
MyData <- read.csv("PCOR File.csv", header = TRUE, sep = ",")

# Summary of the data (left disabled)
#summary(MyData)

Checking few rows of our dataframe


# Print the first six rows of MyData.
# NOTE(review): the first column prints as "ï..Hospital" in the recorded output
# below -- presumably a byte-order mark at the start of the CSV that was read
# with a non-UTF-8 encoding; confirm against the source file.
head(MyData)

## ï..Hospital Dateofdiagnosis Indicator_X


## 1 A 06-01-2016 1
## 2 A 13-01-2016 1
## 3 A 13-01-2016 1
## 4 A 17-01-2016 0
## 5 A 22-01-2016 1
## 6 A 18-02-2016 0

Checking if there are any NULL or empty values in any of the columns
# Count the missing (NA) values in each of the three columns.
sum(is.na(MyData$Indicator_X))

## [1] 0

#No Null values in the Indicator_X column

sum(is.na(MyData$Dateofdiagnosis))

## [1] 0
#No Null values in the Dateofdiagnosis column

# The hospital column is addressed by position rather than by its mangled name
# ("ï..Hospital" in the recorded head() output): `$` on a name that does not
# exist returns NULL, and sum(is.na(NULL)) is 0, so a name that differs on
# another machine would make this check pass without testing anything.
sum(is.na(MyData[[1]]))

## [1] 0

#No Null values in the Hospital column

Renaming the hospital column


# Give the first column the clean name "Hospital"
names(MyData)[1] <- "Hospital"

We shall run some SQL queries to get some idea about the data. This
will help in analysing the data
# Count, for each hospital, the rows of MyData whose Indicator_X equals 1.
# NOTE(review): the rest of this report treats Indicator_X = 1 as "PSA
# assessment completed" rather than "cancer diagnosed" -- confirm the meaning
# of the indicator against the data dictionary.
sqldf("select Hospital, count((Indicator_X)) from MyData where Indicator_X==1
GROUP BY Hospital")

## Hospital count((Indicator_X))
## 1 A 81
## 2 B 578
## 3 C 130
## 4 D 164
## 5 E 700
## 6 F 409
## 7 G 22
## 8 H 288
## 9 I 438
## 10 L 39
## 11 M 828
## 12 N 601
## 13 O 220
## 14 P 308
After running the above query we see that Hospital G (22) and Hospital L (39) have the
fewest records with a completed PSA assessment (Indicator_X = 1); Hospital A (81) is the next lowest.

Now Checking on average how many 0’s and 1’s are present in the
Indicator_X column for all hospitals and plotting in a histogram
# Histogram of the 0/1 values in Indicator_X across all hospitals
hist(
  MyData$Indicator_X,
  breaks = 5,
  xlim = c(0, 1),
  main = "HISTOGRAM FOR PSA ASSESMENTS",
  xlab = "Indicator_X",
  col = "yellow",
  border = "blue",
  las = 1
)
From the above plot, we see that overall the number of completed PSA assessments
is greater than the number of incomplete PSA assessments.
Counts:
Indicator_X = 0: 3358
Indicator_X = 1: 4806

Density plot for the Indicator_X to check how densely the 2 indicators
are populated
## Text styles reused by the plots: bold-italic red for titles and bold-italic
## blue for axis titles, both at size 15.
red.bold.italic.text <- element_text(
  face = "bold.italic", color = "red", size = 15
)
blue.bold.italic.text <- element_text(
  face = "bold.italic", color = "blue", size = 15
)

## Density of Indicator_X over all records. The data frame is passed to
## ggplot() and the column is named bare inside aes(), instead of reaching into
## MyData with `$` from an empty ggplot(); the x-axis label is set explicitly
## in labs(), so the rendered plot is unchanged.
ggplot(MyData, aes(x = Indicator_X)) +
  geom_density(colour = "blue", fill = "gray") +
  labs(title = "PSA ASSESMENT", x = "Indicator_X") +
  theme(title = red.bold.italic.text, axis.title = blue.bold.italic.text) +
  theme(plot.title = element_text(hjust = 0.5))

From the above plot, we see that the density of completed PSA assessments
(Indicator_X = 1) is much higher than that of the non-
completed PSA assessments (Indicator_X = 0)
Calculating the averages of the Indicator_X by grouping Indicator_X
with Hospital
# Share of records per hospital with Indicator_X equal to 0: the mean of the
# logical comparison is the proportion of TRUE values in each group.
ag1 <- aggregate(
  MyData$Indicator_X == 0 ~ MyData$Hospital,
  data = MyData,
  FUN = mean
)
ag1

## MyData$Hospital MyData$Indicator_X == 0
## 1 A 0.4527027
## 2 B 0.2444444
## 3 C 0.3193717
## 4 D 0.4184397
## 5 E 0.5694957
## 6 F 0.2735346
## 7 G 0.4500000
## 8 H 0.4396887
## 9 I 0.3240741
## 10 L 0.3809524
## 11 M 0.3925165
## 12 N 0.4445471
## 13 O 0.5546559
## 14 P 0.2000000

# Share of records per hospital with Indicator_X equal to 1.
ag2 <- aggregate(
  MyData$Indicator_X == 1 ~ MyData$Hospital,
  data = MyData,
  FUN = mean
)
ag2

## MyData$Hospital MyData$Indicator_X == 1
## 1 A 0.5472973
## 2 B 0.7555556
## 3 C 0.6806283
## 4 D 0.5815603
## 5 E 0.4305043
## 6 F 0.7264654
## 7 G 0.5500000
## 8 H 0.5603113
## 9 I 0.6759259
## 10 L 0.6190476
## 11 M 0.6074835
## 12 N 0.5554529
## 13 O 0.4453441
## 14 P 0.8000000

In the output, we see that the average completion of the PSA assessment
(Indicator_X=1) in Hospital A (0.547) is lower than in most of the other hospitals; only Hospitals E and O are lower.
## For each hospital and each date of diagnosis, the share of records whose
## PSA assessment was not completed (Indicator_X = 0).
## (The continuation lines of these comments had lost their "#" prefix and
## were bare text that R cannot parse.)
ag3 <- aggregate(
  MyData$Indicator_X == 0 ~ MyData$Dateofdiagnosis + MyData$Hospital,
  FUN = mean, data = MyData
)
#ag3

## For each hospital and each date of diagnosis, the share of records whose
## PSA assessment was completed (Indicator_X = 1).
ag4 <- aggregate(
  MyData$Indicator_X == 1 ~ MyData$Dateofdiagnosis + MyData$Hospital,
  FUN = mean, data = MyData
)
#ag4
Counting the number of PSA assesments completed and not
completed in each of the hospitals
# Count the 0's (incomplete) and 1's (completed) in the Indicator_X column for
# every hospital with one grouped query. This replaces one pair of copy-pasted
# queries per hospital, in which the Hospital F and Hospital G queries were run
# twice and the Hospital C summary comment had its two counts swapped.
sqldf("select Hospital,
              sum(case when Indicator_X = 0 then 1 else 0 end) as No_of_0s,
              sum(case when Indicator_X = 1 then 1 else 0 end) as No_of_1s
       from MyData
       group by Hospital
       order by Hospital")

# Counts recorded by the original per-hospital queries:
#
# Hospital  No of 0's  No of 1's
# A               67         81
# B              187        578
# C               61        130
# D              118        164
# E              926        700
# F              154        409
# G               18         22
# H              226        288
# I              210        438
# L               24         39
# M              535        828
# N              481        601
# O              274        220
# P               77        308

FOR ALL ASSESMENTS THAT ARE INCOMPLETE(Indicator_X=0)


## Now we filter out the data.
## All rows with a non-completed PSA assessment (Indicator_X == 0) are taken
## into a new data frame called "cancer_NOT_diagnosed".
cancer_NOT_diagnosed <- MyData %>%
  filter(Indicator_X == 0)

## Warning: package 'bindrcpp' was built under R version 3.4.4

## Checking the data
#cancer_NOT_diagnosed

## Convert the "Dateofdiagnosis" column (day-month-year text such as
## "06-01-2016") to Date type. The parsed dates go into a new column, which is
## used for placing the records along the time axis.
## The format string is written on one line; it had been split across two
## lines, which put a line break inside the literal.
new_date_df <- cancer_NOT_diagnosed %>%
  mutate(
    Dateofdiagnosis_for_cancer = as.Date(Dateofdiagnosis, format = "%d-%m-%Y")
  )

## Checking the new data frame
#new_date_df

## Defining the text styles for better looking graphs.
red.bold.italic.text <- element_text(face = "bold.italic", color = "red")
blue.bold.italic.text <- element_text(face = "bold.italic", color = "blue")

## Point plot of the non-completed PSA assessments: the date of diagnosis is on
## the x-axis and the hospital on the y-axis, one point per record.
## The plot title is restored to a single line for the same reason as the date
## format above.
new_date_df %>%
  ggplot(
    aes(x = Dateofdiagnosis_for_cancer, y = Hospital, fill = Indicator_X)
  ) +
  geom_point(color = "green", stat = "identity", size = 1) +
  labs(title = "CANCER DIAGNOSIS", x = "Year Diagnosed", y = "Hospital") +
  theme(title = red.bold.italic.text, axis.title = blue.bold.italic.text) +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
From the above plot, we see that in general, Hospital A has relatively few
non-completed PSA assessments compared to Hospitals B, E, I, M, N and O.

FOR ALL ASSESMENTS THAT ARE COMPLETED(Indicator_X=1)


## Now we filter out the data.
## All rows with a completed PSA assessment (Indicator_X == 1) are taken into a
## new data frame called "cancer_diagnosed". (The earlier comment described
## this subset as "Non-completed", which contradicts the filter below.)
cancer_diagnosed <- MyData %>%
  filter(Indicator_X == 1)

## Checking the data
#cancer_diagnosed

## Convert the "Dateofdiagnosis" column (day-month-year text such as
## "06-01-2016") to Date type. The parsed dates go into a new column, which is
## used for placing the records along the time axis.
## The format string is written on one line; it had been split across two
## lines, which put a line break inside the literal.
new_date_df <- cancer_diagnosed %>%
  mutate(
    Dateofdiagnosis_for_cancer = as.Date(Dateofdiagnosis, format = "%d-%m-%Y")
  )

## Checking the new data frame
#new_date_df

## Defining the text styles for better looking graphs.
red.bold.italic.text <- element_text(
  face = "bold.italic", color = "red", size = 15
)
blue.bold.italic.text <- element_text(
  face = "bold.italic", color = "blue", size = 15
)

## Point plot of the completed PSA assessments: the date of diagnosis is on the
## x-axis and the hospital on the y-axis, one point per record.
## The plot title is restored to a single line for the same reason as the date
## format above.
new_date_df %>%
  ggplot(
    aes(x = Dateofdiagnosis_for_cancer, y = Hospital, fill = Indicator_X)
  ) +
  geom_point(color = "green", stat = "identity", size = 1) +
  labs(title = "CANCER DIAGNOSIS", x = "Year Diagnosed", y = "Hospital") +
  theme(title = red.bold.italic.text, axis.title = blue.bold.italic.text) +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
From the above plot, we see that in general, Hospital A has a relatively low
number of completed PSA assessments compared to Hospitals B, E, I, H, M,
N, O and P.

CREATING PIE CHART for completed PSA assessment(Indicator_X=1)


for each hospital.
# Hospital codes used as pie-chart labels: A to I, then L, then M to P
# (there is no hospital J or K in the list).
Unq_hospitals <- c(LETTERS[1:9], "L", LETTERS[13:16])

Grouping the data for individual hospitals with their completed PSA
assessments.
# One data frame per hospital holding only its completed PSA assessments
# (Indicator_X == 1). The trailing comments give the recorded row counts.
grp1 <- filter(MyData, Indicator_X == 1, Hospital == "A")   # 81 obs
grp2 <- filter(MyData, Indicator_X == 1, Hospital == "B")   # 578 obs
grp3 <- filter(MyData, Indicator_X == 1, Hospital == "C")   # 130 obs
grp4 <- filter(MyData, Indicator_X == 1, Hospital == "D")   # 164 obs
grp5 <- filter(MyData, Indicator_X == 1, Hospital == "E")   # 700 obs
grp6 <- filter(MyData, Indicator_X == 1, Hospital == "F")   # 409 obs
grp7 <- filter(MyData, Indicator_X == 1, Hospital == "G")   # 22 obs
grp8 <- filter(MyData, Indicator_X == 1, Hospital == "H")   # 288 obs
grp9 <- filter(MyData, Indicator_X == 1, Hospital == "I")   # 438 obs
grp10 <- filter(MyData, Indicator_X == 1, Hospital == "L")  # 39 obs
grp11 <- filter(MyData, Indicator_X == 1, Hospital == "M")  # 828 obs
grp12 <- filter(MyData, Indicator_X == 1, Hospital == "N")  # 601 obs
grp13 <- filter(MyData, Indicator_X == 1, Hospital == "O")  # 220 obs
grp14 <- filter(MyData, Indicator_X == 1, Hospital == "P")  # 308 obs

# Number of completed PSA assessments (Indicator_X == 1) per hospital, counted
# from the data in the same order as the labels in Unq_hospitals.
# The previous hand-typed vector held 15 values for 14 labels (the 39 for
# Hospital L appeared twice), so every slice after L carried the wrong label.
# factor(..., levels = Unq_hospitals) fixes both the order and the length of
# the result, and gives 0 for a hospital with no completed assessments.
diagnosed <- as.vector(
  table(factor(MyData$Hospital[MyData$Indicator_X == 1], levels = Unq_hospitals))
)

#Plotting the Pie chart
pie3D(diagnosed, labels = Unq_hospitals, explode = 0.1,
      main = " PSA ASSESSMENT COMPLETED")
From the pie chart we can see that Hospitals M, E, N and B account for far larger shares of
the completed PSA assessments than Hospital A.

CREATING PIE CHART for incomplete PSA assessment(Indicator_X=0)


for each hospital.
# One data frame per hospital holding only its incomplete PSA assessments
# (Indicator_X == 0). The trailing comments give the recorded row counts.
grp1 <- filter(MyData, Indicator_X == 0, Hospital == "A")   # 67 obs
grp2 <- filter(MyData, Indicator_X == 0, Hospital == "B")   # 187 obs
grp3 <- filter(MyData, Indicator_X == 0, Hospital == "C")   # 61 obs
grp4 <- filter(MyData, Indicator_X == 0, Hospital == "D")   # 118 obs
grp5 <- filter(MyData, Indicator_X == 0, Hospital == "E")   # 926 obs
grp6 <- filter(MyData, Indicator_X == 0, Hospital == "F")   # 154 obs
grp7 <- filter(MyData, Indicator_X == 0, Hospital == "G")   # 18 obs
grp8 <- filter(MyData, Indicator_X == 0, Hospital == "H")   # 226 obs
grp9 <- filter(MyData, Indicator_X == 0, Hospital == "I")   # 210 obs
grp10 <- filter(MyData, Indicator_X == 0, Hospital == "L")  # 24 obs
grp11 <- filter(MyData, Indicator_X == 0, Hospital == "M")  # 535 obs
grp12 <- filter(MyData, Indicator_X == 0, Hospital == "N")  # 481 obs
grp13 <- filter(MyData, Indicator_X == 0, Hospital == "O")  # 274 obs
grp14 <- filter(MyData, Indicator_X == 0, Hospital == "P")  # 77 obs

# Number of incomplete PSA assessments (Indicator_X == 0) per hospital, counted
# from the data in the same order as the labels in Unq_hospitals, instead of
# being copied by hand from earlier output. factor(..., levels = Unq_hospitals)
# guarantees one value per label, in label order.
# Recorded values: 67,187,61,118,926,154,18,226,210,24,535,481,274,77
diagnosed <- as.vector(
  table(factor(MyData$Hospital[MyData$Indicator_X == 0], levels = Unq_hospitals))
)

#Plotting the Pie chart
pie3D(diagnosed, labels = Unq_hospitals, explode = 0.1,
      main = "INCOMPLETE PSA ASSESSMENT")
From the above graph we see that for Hospital A the incompleteness of the PSA
assessment is quite low compared to the other hospitals. Hospital E has a
large ratio of incompleteness of the PSA assessment, followed by Hospital M and
Hospital N. However, Hospital A and Hospital B are quite comparable according
to the stats above.

You might also like