% Download as txt, pdf, or txt
% Download as txt, pdf, or txt
% You are on page 1of 4

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Lachaise Assignment
% LaTeX Template
% Version 1.0 (26/6/2018)
%
% This template originates from:
% http://www.LaTeXTemplates.com
%
% Authors:
% Marion Lachaise & François Févotte
% Vel (vel@LaTeXTemplates.com)
%
% License:
% CC BY-NC-SA 3.0 (http://creativecommons.org/licenses/by-nc-sa/3.0/)
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%----------------------------------------------------------------------------------------
% PACKAGES AND OTHER DOCUMENT CONFIGURATIONS
%----------------------------------------------------------------------------------------

\documentclass{article}

\input{structure.tex} % Include the file specifying the document structure and custom commands

%----------------------------------------------------------------------------------------
% ASSIGNMENT INFORMATION
%----------------------------------------------------------------------------------------

\title{STAT8240: Assignment \#1} % Title of the assignment

\author{Yifan Zhang \\ \texttt{yzhang60@kennesaw.edu}} % Author name and email address

\date{Kennesaw State University --- \today} % University, school and/or department name(s) and a date

%----------------------------------------------------------------------------------------

\begin{document}

\maketitle % Print the title

%----------------------------------------------------------------------------------------
% INTRODUCTION
%----------------------------------------------------------------------------------------

\begin{warn} % Information block


Please read through your syllabus regarding the assignment requirements and
how you should submit your assignment. The deadline for this assignment is
\textbf{\underline{Sep 21st at midnight}}. If you have any questions about the
assignment, feel free to drop me an email.
\end{warn}

%----------------------------------------------------------------------------------------
% PROBLEM 1
%----------------------------------------------------------------------------------------

\section{Missing Data} % Numbered section


\begin{info}
For Question 1 and Question 2, we will use the dataset ``diabetes.csv''. For
information related to this data, please see the readme file.
\end{info}
\begin{question}
Use ``class'' as the dependent variable (whether the observation has diabetes)
and the other variables as predictors. Answer the following questions:
\begin{enumerate}[(a)]
\item Using the raw data, run a linear regression analysis on ``class''
with all predictors. Summarize what you find. Are there any effects that are
counter-intuitive?
\item Looking at the dataset description, is it normal to have a value
of ``0'' for variables such as blood pressure and insulin level? How do you explain
the zeros?
\item Run a regression analysis dropping all observations that have a
value of ``0'' in the following variables: plasma, bp, skin, insulin, body. Compare
your updated results with the results obtained in (a). What do you find?
\end{enumerate}
\end{question}

\begin{question}
For this question, we will switch gears and use R to impute the missing
values. Before answering the questions, first install the R package ``mice''. Use the
R code ``mice.R'' provided under the assignment folder (make changes as you see fit)
and answer the following questions:
% Subquestions numbered with letters
\begin{enumerate}[(a)]
\item Looking at the correlation among predictors, do you think it is a
good idea to use a regression model of all predictors for missing value imputation?
Why or why not?
\item Use regression, Bayesian regression and predictive mean matching
as the imputation model. How does the correlation among predictors change after
imputation? What about the regression results: are there any noticeable differences?
\end{enumerate}
\end{question}

\section{Maximum Likelihood Estimator}

\begin{question}
Let $y_1,y_2,\dots,y_n$ be i.i.d.\ samples from the following distributions.
For each of the following distributions, do the following: 1) check the wiki page
(your best friend) for the distribution if you are not familiar with it, 2) derive
the MLE of ALL unknown parameters and 3) check the second derivative to make sure
it is indeed the maximum.
% Subquestions numbered with letters
\begin{enumerate}[(a)]
\item Normal distribution with unknown mean $\mu$ and variance
$\sigma^2$, $N(\mu,\sigma^2)$:
\begin{equation}
f(y)=\frac{1}{\sqrt{2\pi\sigma^2}}\exp\left(-\frac{(y-\mu)^2}{2\sigma^2}\right)
\end{equation}
\item Exponential distribution with unknown rate $\lambda$,
$\mathrm{Exp}(\lambda)$:
\begin{equation}
f(y)=\lambda \exp(-\lambda y)
\end{equation}
\item Poisson distribution (discrete) with unknown mean $\lambda$,
$\mathrm{Pois}(\lambda)$:
\begin{equation}
P(Y_i=y_i)=\frac{\lambda^{y_i}\exp(-\lambda)}{y_i!}
\end{equation}
\item Binomial distribution with \textbf{known} number of trials $N$
and \textbf{unknown} success probability $p$, $\mathrm{Binom}(N,p)$:
\begin{equation}
P(Y_i=y_i)=\frac{N!}{y_i!(N-y_i)!}p^{y_i}(1-p)^{N-y_i}
\end{equation}
\item (Optional) Let $y_1=y$ be the only sample from a binomial
distribution with \textbf{unknown} number of trials $N$ and \textbf{known} success
probability $p$, $\mathrm{Binom}(N,p)$:

\textit{Hint: is $N$ differentiable? Start by considering the ratio of the
likelihoods $L(N=n+1\mid p,y)$ and $L(N=n\mid p,y)$.}

\end{enumerate}
\end{question}

\begin{question}
In the file ``tdist.csv'', you will find 200 samples from a t-distribution with
one degree of freedom and unknown location parameter $\mu$, $t(\mu,1)$:

\begin{enumerate}[(a)]
\item Given the probability density function of a t-distribution
$t(\mu,1)$:
\begin{equation}
f(y)=\frac{1}{\Gamma(\frac{1}{2})\sqrt{\pi}}(1+(y-\mu)^2)^{-1}.
\end{equation}
Write down the log-likelihood function of 200 samples and derive the first
and second derivatives of the log-likelihood function with respect to $\mu$. Can you
find $\hat{\mu}_{\mathrm{MLE}}$ analytically?
\item Write a program using the Newton--Raphson method to find $\hat{\mu}_{\mathrm{MLE}}$
numerically.
\end{enumerate}
\end{question}

\begin{question}
In the file ``probit.csv'', you will find 400 samples generated from a probit
regression model:
\begin{equation}
y_i = \begin{cases} 0 & \text{if } \beta_0+x_i\beta_1+\epsilon_i<0\\
1 & \text{if } \beta_0+x_i\beta_1+\epsilon_i>0 \end{cases}, \quad
\epsilon_i\sim N(0,1).
\end{equation}
Write a program using the Newton--Raphson method to find $\hat{\beta_0}$ and
$\hat{\beta_1}$ numerically.
\end{question}

\section{Standard Error and Bootstrap}


\begin{info}
For Question 6 and Question 7, we will use a technique known as ``simulation''
to further your understanding of standard error and bootstrap. Simulation typically
involves generating synthetic data with a known data-generating process (known
parameters) and evaluating how well estimators recover the true parameters.
\end{info}

\begin{question}
\begin{enumerate}[(a)]
\item The Python code ``sim\_regression.py'' includes a function that
generates $n$ samples from the following regression model:
\begin{equation}
y_i=\beta_0+\beta_1x_i+\epsilon_i, \quad \epsilon_i\sim N(0,\sigma^2),
\quad \beta_0=3,\beta_1=2,\sigma^2=1.
\end{equation}
\item Generate 1,000 different datasets with $n=20$. For each generated
dataset, find $\hat{\beta_1}$, calculate the standard error $SE(\hat{\beta_1})$ based
on the asymptotic distribution and construct the 95\% confidence interval:
\begin{equation}
(\hat{\beta_1}-2SE(\hat{\beta_1}),\hat{\beta_1}+2SE(\hat{\beta_1}))
\end{equation}

Count the number of times the true parameter value $\beta_1$ is contained in
the 95\% CI.
\item Repeat the same exercise as in (b), but this time with
$n=50$.
\item Repeat the same exercise as in (b), but this time with
$n=300$.

\end{enumerate}
\end{question}

\begin{question}
Generate one dataset with $n=40$ using ``sim\_regression.py''.
\begin{enumerate}[(a)]
\item Find $\hat{\beta_1}$, calculate the standard error
$SE(\hat{\beta_1})$ based on the asymptotic distribution and construct the 95\% confidence
interval.
\item Calculate the standard error $SE(\hat{\beta_1})$ using bootstrap
and construct the 95\% confidence interval. For bootstrap, generate 100 bootstrap
samples by drawing $n=40$ samples from the generated dataset (with
replacement). Compare the two 95\% confidence intervals. Which one do you prefer in this
case, and why?

\end{enumerate}
\end{question}

%----------------------------------------------------------------------------------------

\end{document}

You might also like