Report PSA Assessment

PSA_ASSESSMENT
VARUN MATHUR, 28954114
16 October 2018
——————————————————————————————–
Please install the following packages before running this report on your PC
(run these statements once; they are not needed on later runs):
install.packages("sqldf")
install.packages("ggplot2")
install.packages("dplyr")
install.packages("plotrix")
——————————————————————————————–
#Loading the packages
#NOTE: Please uncomment when loading the packages.
library("dplyr")
## Warning: package 'dplyr' was built under R version 3.4.4
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':

##
## filter, lag
## The following objects are masked from 'package:base':

##
## intersect, setdiff, setequal, union
library("sqldf")
## Warning: package 'sqldf' was built under R version 3.4.4

## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 3.4.4
## Loading required package: proto
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 3.4.4
library("ggplot2")
## Warning: package 'ggplot2' was built under R version 3.4.4
library("plotrix")
## Warning: package 'plotrix' was built under R version 3.4.4
##NOTE: Kindly note, I have read the file as .csv and NOT .xlsx
# Read the CSV export "PCOR File.csv" (comma separated, with a header row)
# into the data frame "MyData".
# The first column header arrives mangled as "ï..Hospital" (see the head()
# output below) and is renamed to "Hospital" further down.
# NOTE(review): this looks like a UTF-8 byte-order mark at the start of the
# file; fileEncoding = "UTF-8-BOM" would probably avoid it -- confirm first.

MyData <- read.csv(file="PCOR File.csv", header=TRUE, sep=",")
# Checking the summary of the data

#summary(MyData)
Checking few rows of our dataframe

head(MyData)
## ï..Hospital Dateofdiagnosis Indicator_X

## 1 A 06-01-2016 1
## 2 A 13-01-2016 1
## 3 A 13-01-2016 1
## 4 A 17-01-2016 0
## 5 A 22-01-2016 1
## 6 A 18-02-2016 0
Checking if there are any NULL or empty values in any of the columns
sum(is.na(MyData$Indicator_X))
## [1] 0
#No Null values in the Indicator_X column
sum(is.na(MyData$Dateofdiagnosis))
## [1] 0
#No Null values in the Dateofdiagnosis column
# Count missing values in the hospital column. The column is addressed by
# position (it is the first column of MyData) because its imported name,
# "ï..Hospital", contains mangled leading characters; if that name differs
# on another system, `$` would return NULL and the check would pass vacuously.
sum(is.na(MyData[[1]]))
## [1] 0
#No Null values in the Hospital column
Renaming the hospital column

colnames(MyData)[1] <- "Hospital"
We shall run some SQL queries to get some idea about the data. This
will help in analysing the data
# Count, per hospital, the rows where Indicator_X equals 1. Elsewhere in this
# report Indicator_X = 1 is interpreted as a completed PSA assessment.
sqldf("select Hospital, count((Indicator_X)) from MyData where Indicator_X==1
GROUP BY Hospital")
## Hospital count((Indicator_X))
## 1 A 81
## 2 B 578
## 3 C 130
## 4 D 164
## 5 E 700
## 6 F 409
## 7 G 22
## 8 H 288
## 9 I 438
## 10 L 39
## 11 M 828
## 12 N 601
## 13 O 220
## 14 P 308
After running the above query we see that Hospital G (22) and Hospital L (39)
have the fewest records with Indicator_X = 1, followed by Hospital A (81).
Now Checking on average how many 0’s and 1’s are present in the
Indicator_X column for all hospitals and plotting in a histogram
# Histogram of the 0/1 values held in Indicator_X across all hospitals.
hist(
  MyData$Indicator_X,
  breaks = 5,
  xlim = c(0, 1),
  las = 1,
  col = "yellow",
  border = "blue",
  main = "HISTOGRAM FOR PSA ASSESMENTS",
  xlab = "Indicator_X"
)
From the above plot, we see that in general the number of PSA Assesments
completed is more than the incomplete PSA Assesments
Count for:
Indicator_X = 0: 3358
Indicator_X = 1: 4806
Density plot for the Indicator_X to check how densely the 2 indicators
are populated
## Text themes used to make the graphs easier to read.
red.bold.italic.text <- element_text(face = "bold.italic", color = "red",
                                     size = 15)
blue.bold.italic.text <- element_text(face = "bold.italic", color = "blue",
                                      size = 15)
# Density of Indicator_X over all records. The data frame is handed to
# ggplot() and the column is named bare inside aes(), rather than being
# extracted with `$`, so the mapping stays tied to the plot's own data.
ggplot(MyData, aes(x = Indicator_X)) +
  geom_density(colour = "blue", fill = "gray") +
  labs(title = "PSA ASSESMENT", x = "Indicator_X") +
  theme(title = red.bold.italic.text, axis.title = blue.bold.italic.text) +
  theme(plot.title = element_text(hjust = 0.5))
From the above plot, we see that the density of completed PSA assessments
(Indicator_X = 1) is much higher than that of the non-completed PSA
assessments (Indicator_X = 0)
Calculating the averages of the Indicator_X by grouping Indicator_X
with Hospital
# Per-hospital proportion of records with Indicator_X == 0: the comparison
# yields TRUE/FALSE, and taking its mean gives the share of TRUE rows.
ag1 <- aggregate(MyData$Indicator_X==0 ~ MyData$Hospital, FUN = mean, data =
MyData)
ag1
## MyData$Hospital MyData$Indicator_X == 0
## 1 A 0.4527027
## 2 B 0.2444444
## 3 C 0.3193717
## 4 D 0.4184397
## 5 E 0.5694957
## 6 F 0.2735346
## 7 G 0.4500000
## 8 H 0.4396887
## 9 I 0.3240741
## 10 L 0.3809524
## 11 M 0.3925165
## 12 N 0.4445471
## 13 O 0.5546559
## 14 P 0.2000000
# Per-hospital proportion of records with Indicator_X == 1 (mean of the
# TRUE/FALSE comparison within each hospital).
ag2<- aggregate(MyData$Indicator_X==1 ~ MyData$Hospital, FUN = mean, data =

MyData)
ag2
## MyData$Hospital MyData$Indicator_X == 1
## 1 A 0.5472973
## 2 B 0.7555556
## 3 C 0.6806283
## 4 D 0.5815603
## 5 E 0.4305043
## 6 F 0.7264654
## 7 G 0.5500000
## 8 H 0.5603113
## 9 I 0.6759259
## 10 L 0.6190476
## 11 M 0.6074835
## 12 N 0.5554529
## 13 O 0.4453441
## 14 P 0.8000000
In the output, we see that the average completion of PSA assessment
(Indicator_X=1) in Hospital A (0.547) is lower than in most other hospitals;
only Hospital E (0.431) and Hospital O (0.445) are lower.
## For each hospital and each date of diagnosis, the proportion of records
## with a non-completed PSA assessment (Indicator_X=0).
ag3 <- aggregate(MyData$Indicator_X==0 ~
MyData$Dateofdiagnosis+MyData$Hospital, FUN = mean, data = MyData)
#ag3
## For each hospital and each date of diagnosis, the proportion of records
## with a completed PSA assessment (Indicator_X=1).
ag4 <- aggregate(MyData$Indicator_X==1 ~
MyData$Dateofdiagnosis+MyData$Hospital, FUN = mean, data = MyData)
#ag4
Counting the number of PSA assesments completed and not
completed in each of the hospitals
# counting number of 0's and 1's in the Indicator_X column for Hospital A

and Hospital=='A' GROUP BY Hospital")
## 1 A 81

and Hospital=='A' GROUP BY Hospital")
## 1 A 67
#No of 0's : 67
#No of 1's : 81
# counting number of 0's and 1's for Hospital B

and Hospital=='B' GROUP BY Hospital")
## 1 B 578

and Hospital=='B' GROUP BY Hospital")
## 1 B 187
#No of 0's : 187

#No of 1's : 578
# counting number of 0's and 1's for Hospital C

and Hospital=='C' GROUP BY Hospital")
## 1 C 130

and Hospital=='C' GROUP BY Hospital")
## 1 C 61
#No of 0's : 61
#No of 1's : 130
# counting number of 0's and 1's for Hospital D

and Hospital=='D' GROUP BY Hospital")
## 1 D 164

and Hospital=='D' GROUP BY Hospital")
## 1 D 118
#No of 0's : 118

#No of 1's : 164
# counting number of 0's and 1's for Hospital E

and Hospital=='E' GROUP BY Hospital")
## 1 E 700

and Hospital=='E' GROUP BY Hospital")
## 1 E 926
#No of 0's : 926

#No of 1's : 700
# counting number of 0's and 1's for Hospital F

and Hospital=='F' GROUP BY Hospital")
## 1 F 409

## 1 F 154
#No of 0's : 154
#No of 1's : 409
# counting number of 0's and 1's for Hospital G

and Hospital=='G' GROUP BY Hospital")
## 1 G 22

## 1 G 18
#No of 0's : 18
#No of 1's : 22
# counting number of 0's and 1's for Hospital F

## 1 F 409

## 1 F 154
#No of 0's : 154

#No of 1's : 409
# counting number of 0's and 1's for Hospital G

## 1 G 22

## 1 G 18
#No of 0's : 18
#No of 1's : 22
# counting number of 0's and 1's for Hospital H

and Hospital=='H' GROUP BY Hospital")
## 1 H 288

and Hospital=='H' GROUP BY Hospital")
## 1 H 226
#No of 0's : 226

#No of 1's : 288
# counting number of 0's and 1's for Hospital I

and Hospital=='I' GROUP BY Hospital")
## 1 I 438

and Hospital=='I' GROUP BY Hospital")
## 1 I 210
#No of 0's : 210

#No of 1's : 438
# counting number of 0's and 1's for Hospital L

and Hospital=='L' GROUP BY Hospital")
## 1 L 39

and Hospital=='L' GROUP BY Hospital")
## 1 L 24
#No of 0's : 24
#No of 1's : 39
# counting number of 0's and 1's for Hospital M

and Hospital=='M' GROUP BY Hospital")
## 1 M 828

and Hospital=='M' GROUP BY Hospital")
## 1 M 535
#No of 0's : 535

#No of 1's : 828
# counting number of 0's and 1's for Hospital N

and Hospital=='N' GROUP BY Hospital")
## 1 N 601

and Hospital=='N' GROUP BY Hospital")
## 1 N 481
#No of 0's : 481

#No of 1's : 601
# counting number of 0's and 1's for Hospital O

and Hospital=='O' GROUP BY Hospital")
## 1 O 220

and Hospital=='O' GROUP BY Hospital")
## 1 O 274
#No of 0's : 274
#No of 1's : 220
# counting number of 0's and 1's for Hospital P

and Hospital=='P' GROUP BY Hospital")
## 1 P 308

and Hospital=='P' GROUP BY Hospital")
## 1 P 77
#No of 0's : 77
#No of 1's : 308
FOR ALL ASSESMENTS THAT ARE INCOMPLETE(Indicator_X=0)

## Keep only the records whose PSA assessment was not completed
## (Indicator_X equal to 0) in a new data frame, "cancer_NOT_diagnosed".
cancer_NOT_diagnosed <- filter(MyData, Indicator_X == 0)
## Warning: package 'bindrcpp' was built under R version 3.4.4
##Checking the data

#cancer_NOT_diagnosed
## Convert the text column "Dateofdiagnosis" (day-month-year, e.g.
## "06-01-2016") to Date type, keeping the original column unchanged.
# The parsed dates go into a new column, "Dateofdiagnosis_for_cancer", which
# is used for placing the records on a date axis.
# The format string is kept on a single source line so that it contains no
# stray line break between "%d-" and "%m".
new_date_df <- cancer_NOT_diagnosed %>%
  mutate(
    Dateofdiagnosis_for_cancer = as.Date(Dateofdiagnosis, format = "%d-%m-%Y")
  )
#Checking the new dataframe

#new_date_df
## Text themes for better-looking graphs.
red.bold.italic.text <- element_text(face = "bold.italic", color = "red")
blue.bold.italic.text <- element_text(face = "bold.italic", color = "blue")
## Point plot of the records with a non-completed PSA assessment: date of
## diagnosis on the x-axis, hospital on the y-axis.
# The title is kept on one source line so no line break ends up inside it.
new_date_df %>%
  ggplot(aes(x = Dateofdiagnosis_for_cancer, y = Hospital,
             fill = Indicator_X)) +
  geom_point(color = "green", stat = "identity", size = 1) +
  labs(title = "CANCER DIAGNOSIS", x = "Year Diagnosed", y = "Hospital") +
  theme(title = red.bold.italic.text, axis.title = blue.bold.italic.text) +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
From the above plot, we see that in general, the Hospital A has relatively low
non-completness of the PSA assesment as compared to Hospital B, E, I, M, N, O.
FOR ALL ASSESMENTS THAT ARE COMPLETED(Indicator_X=1)

## Keep only the records whose PSA assessment was completed
## (Indicator_X equal to 1) in a new data frame, "cancer_diagnosed".
cancer_diagnosed <- filter(MyData, Indicator_X == 1)
##Checking the data

#cancer_diagnosed
## Convert the text column "Dateofdiagnosis" (day-month-year, e.g.
## "06-01-2016") to Date type, keeping the original column unchanged.
# The parsed dates go into a new column, "Dateofdiagnosis_for_cancer", which
# is used for placing the records on a date axis.
# The format string is kept on a single source line so that it contains no
# stray line break between "%d-" and "%m".
new_date_df <- cancer_diagnosed %>%
  mutate(
    Dateofdiagnosis_for_cancer = as.Date(Dateofdiagnosis, format = "%d-%m-%Y")
  )
#Checking the new dataframe

#new_date_df
## Text themes for better-looking graphs.
red.bold.italic.text <- element_text(face = "bold.italic", color = "red",
                                     size = 15)
blue.bold.italic.text <- element_text(face = "bold.italic", color = "blue",
                                      size = 15)
## Point plot of the records with a completed PSA assessment: date of
## diagnosis on the x-axis, hospital on the y-axis.
# The title is kept on one source line so no line break ends up inside it.
new_date_df %>%
  ggplot(aes(x = Dateofdiagnosis_for_cancer, y = Hospital,
             fill = Indicator_X)) +
  geom_point(color = "green", stat = "identity", size = 1) +
  labs(title = "CANCER DIAGNOSIS", x = "Year Diagnosed", y = "Hospital") +
  theme(title = red.bold.italic.text, axis.title = blue.bold.italic.text) +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
From the above plot, we see that in general, Hospital A has a relatively low
ratio of completeness of the PSA assessment as compared to Hospitals B, E, I,
H, M, N, O and P.
CREATING PIE CHART for completed PSA assessment(Indicator_X=1)

for each hospital.
# Storing all the unique hospitals in a vector
Unq_hospitals<-c("A","B","C","D","E","F","G","H","I","L","M","N","O","P")
Grouping the data for individual hospitals with their completed PSA
assessments.
# One data frame per hospital holding its rows with Indicator_X equal to 1.
# The trailing comment on each line is the row count noted for that hospital.
grp1 <- filter(MyData, Indicator_X == 1 & Hospital == "A")   # 81 obs
grp2 <- filter(MyData, Indicator_X == 1 & Hospital == "B")   # 578 obs
grp3 <- filter(MyData, Indicator_X == 1 & Hospital == "C")   # 130 obs
grp4 <- filter(MyData, Indicator_X == 1 & Hospital == "D")   # 164 obs
grp5 <- filter(MyData, Indicator_X == 1 & Hospital == "E")   # 700 obs
grp6 <- filter(MyData, Indicator_X == 1 & Hospital == "F")   # 409 obs
grp7 <- filter(MyData, Indicator_X == 1 & Hospital == "G")   # 22 obs
grp8 <- filter(MyData, Indicator_X == 1 & Hospital == "H")   # 288 obs
grp9 <- filter(MyData, Indicator_X == 1 & Hospital == "I")   # 438 obs
grp10 <- filter(MyData, Indicator_X == 1 & Hospital == "L")  # 39 obs
grp11 <- filter(MyData, Indicator_X == 1 & Hospital == "M")  # 828 obs
grp12 <- filter(MyData, Indicator_X == 1 & Hospital == "N")  # 601 obs
grp13 <- filter(MyData, Indicator_X == 1 & Hospital == "O")  # 220 obs
grp14 <- filter(MyData, Indicator_X == 1 & Hospital == "P")  # 308 obs
# Completed-assessment counts per hospital, in the same order as the hospital
# labels A, B, C, D, E, F, G, H, I, L, M, N, O, P: exactly 14 values, one per
# label, so that each pie slice is paired with the right hospital.
diagnosed <- c(81, 578, 130, 164, 700, 409, 22,
               288, 438, 39, 828, 601, 220, 308)
# Draw an exploded 3D pie of the counts, one slice per hospital label.
pie3D(
  diagnosed,
  labels = Unq_hospitals,
  explode = 0.1,
  main = " PSA ASSESSMENT COMPLETED"
)
From the pie chart we can see that Hospitals M, E, N and B account for far more
completed PSA assessments than Hospital A. Note that the chart shows raw counts
per hospital, not completion ratios.
CREATING PIE CHART for incomplete PSA assessment(Indicator_X=0)

for each hospital.
# One data frame per hospital holding its rows with Indicator_X equal to 0.
# The trailing comment on each line is the row count noted for that hospital.
grp1 <- filter(MyData, Indicator_X == 0 & Hospital == "A")   # 67 obs
grp2 <- filter(MyData, Indicator_X == 0 & Hospital == "B")   # 187 obs
grp3 <- filter(MyData, Indicator_X == 0 & Hospital == "C")   # 61 obs
grp4 <- filter(MyData, Indicator_X == 0 & Hospital == "D")   # 118 obs
grp5 <- filter(MyData, Indicator_X == 0 & Hospital == "E")   # 926 obs
grp6 <- filter(MyData, Indicator_X == 0 & Hospital == "F")   # 154 obs
grp7 <- filter(MyData, Indicator_X == 0 & Hospital == "G")   # 18 obs
grp8 <- filter(MyData, Indicator_X == 0 & Hospital == "H")   # 226 obs
grp9 <- filter(MyData, Indicator_X == 0 & Hospital == "I")   # 210 obs
grp10 <- filter(MyData, Indicator_X == 0 & Hospital == "L")  # 24 obs
grp11 <- filter(MyData, Indicator_X == 0 & Hospital == "M")  # 535 obs
grp12 <- filter(MyData, Indicator_X == 0 & Hospital == "N")  # 481 obs
grp13 <- filter(MyData, Indicator_X == 0 & Hospital == "O")  # 274 obs
grp14 <- filter(MyData, Indicator_X == 0 & Hospital == "P")  # 77 obs
# Incomplete-assessment counts per hospital, listed in the order of the
# hospital labels (A, B, C, D, E, F, G, H, I, L, M, N, O, P).
diagnosed <- c(67, 187, 61, 118, 926, 154, 18,
               226, 210, 24, 535, 481, 274, 77)
# Draw an exploded 3D pie of these counts, one slice per hospital label.
pie3D(
  diagnosed,
  labels = Unq_hospitals,
  explode = 0.1,
  main = "INCOMPLETE PSA ASSESSMENT"
)
From the above graph we see that for Hospital A the number of incomplete PSA
assessments is much lower than for most other hospitals. Hospital E has the
largest share of incomplete PSA assessments, followed by Hospital M and
Hospital N. However, Hospital A and Hospital B are quite comparable according
to the stats above.

Report PSA Assessement

Uploaded by

Document Information

Copyright

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Report PSA Assessement

Uploaded by

Copyright:

PSA_ASSESMENT

VARUN MATHUR, 28954114

#NOTE: Please uncomment when loading the packages.

## Warning: package 'dplyr' was built under R version 3.4.4

## The following objects are masked from 'package:stats':

## The following objects are masked from 'package:base':

## Warning: package 'sqldf' was built under R version 3.4.4

## Warning: package 'gsubfn' was built under R version 3.4.4

## Loading required package: proto

## Loading required package: RSQLite

## Warning: package 'RSQLite' was built under R version 3.4.4

## Warning: package 'ggplot2' was built under R version 3.4.4

## Warning: package 'plotrix' was built under R version 3.4.4

#Reading the CSV file and storing it in a variable "MyData"

# Checking the summary of the data

Checking few rows of our dataframe

## ï..Hospital Dateofdiagnosis Indicator_X

#No Null values in the Indicator_X column

#No Null values in the Hospital column

Renaming the hospital column

ag2<- aggregate(MyData$Indicator_X==1 ~ MyData$Hospital, FUN = mean, data =

sqldf("select Hospital, count((Indicator_X)) from MyData where Indicator_X==1

sqldf("select Hospital, count((Indicator_X)) from MyData where Indicator_X==0

# counting number of 0's and 1's for Hospital B

sqldf("select Hospital, count((Indicator_X)) from MyData where Indicator_X==1

sqldf("select Hospital, count((Indicator_X)) from MyData where Indicator_X==0

#No of 0's : 187

# counting number of 0's and 1's for Hospital C

sqldf("select Hospital, count((Indicator_X)) from MyData where Indicator_X==1

sqldf("select Hospital, count((Indicator_X)) from MyData where Indicator_X==0

# counting number of 0's and 1's for Hospital D

sqldf("select Hospital, count((Indicator_X)) from MyData where Indicator_X==1

sqldf("select Hospital, count((Indicator_X)) from MyData where Indicator_X==0

#No of 0's : 118

# counting number of 0's and 1's for Hospital E

sqldf("select Hospital, count((Indicator_X)) from MyData where Indicator_X==1

sqldf("select Hospital, count((Indicator_X)) from MyData where Indicator_X==0

#No of 0's : 926

# counting number of 0's and 1's for Hospital F

sqldf("select Hospital, count((Indicator_X)) from MyData where Indicator_X==1

sqldf("select Hospital, count((Indicator_X)) from MyData where Indicator_X==0

# counting number of 0's and 1's for Hospital G

sqldf("select Hospital, count((Indicator_X)) from MyData where Indicator_X==1

sqldf("select Hospital, count((Indicator_X)) from MyData where Indicator_X==0

# counting number of 0's and 1's for Hospital F

sqldf("select Hospital, count((Indicator_X)) from MyData where Indicator_X==1

sqldf("select Hospital, count((Indicator_X)) from MyData where Indicator_X==0

#No of 0's : 154

# counting number of 0's and 1's for Hospital G

sqldf("select Hospital, count((Indicator_X)) from MyData where Indicator_X==1

sqldf("select Hospital, count((Indicator_X)) from MyData where Indicator_X==0

# counting number of 0's and 1's for Hospital H

sqldf("select Hospital, count((Indicator_X)) from MyData where Indicator_X==1

sqldf("select Hospital, count((Indicator_X)) from MyData where Indicator_X==0

#No of 0's : 226

# counting number of 0's and 1's for Hospital I

sqldf("select Hospital, count((Indicator_X)) from MyData where Indicator_X==1

sqldf("select Hospital, count((Indicator_X)) from MyData where Indicator_X==0

#No of 0's : 210

# counting number of 0's and 1's for Hospital L

sqldf("select Hospital, count((Indicator_X)) from MyData where Indicator_X==1

sqldf("select Hospital, count((Indicator_X)) from MyData where Indicator_X==0

# counting number of 0's and 1's for Hospital M

sqldf("select Hospital, count((Indicator_X)) from MyData where Indicator_X==1

sqldf("select Hospital, count((Indicator_X)) from MyData where Indicator_X==0

#No of 0's : 535

# counting number of 0's and 1's for Hospital N