library(rgl) ## For interactive 3D plots
library(mvtnorm) ## For multivariate normal distributions
# ?dmvnorm ## check help for distribution of multivariate function
# dmvnorm ## type the function name to see how the author define the function in details.
library(ggplot2) ## For high-level R plots
Let’s first draw the density for the bivariate normal distribution with \(E(X)=0\), \(E(Y)=1\), \(var(X)=1\), \(var(Y)=4\), and \(Corr(X,Y)=0.5\). Note that \(Cov(X,Y)=0.5\times \sqrt{1}\times \sqrt{4}=1\).
CovXY=matrix(c(1,1,1,4),nrow=2)
x=seq(from=-3,to=3,by=0.1) ## range of X
y=seq(from=-7,to=7,by=0.1) ## range of Y
xy=cbind(x=rep(x,each=length(y)),y=rep(y,times=length(x))) ## pairs of (x,y) in matrix
fxy=dmvnorm(xy,mean=c(0,1),sigma=CovXY) ## a vector of joint density function at (x,y)
z=matrix(fxy,byrow=TRUE,nrow=length(x),ncol=length(y)) ## reshape to a matrix
persp3d(x,y,z,col='steelblue',alpha=0.7) ## 3D plot in rgl library, alpha<1 for transparent color
planes3d(a=0,b=0,c=-1,d=0.08,alpha=0.5, col='green') ## plane st. joint pdf=0.08
planes3d(a=0,b=0,c=-1,d=0.06,alpha=0.5, col='green') ## plane st. joint pdf=0.06
planes3d(a=0,b=0,c=-1,d=0.04,alpha=0.5, col='green') ## plane st. joint pdf=0.04
grid3d(c("x", "y", "z")) ## add grid in a 3D plot
Let’s see this bivariate normal density in a movie.
movie3d(spin3d(axis=c(0,0,1),rpm=4), duration = 15, movie ="BiNorm", type="gif",
dir="./Rmovies",verbose = FALSE)
knitr::include_graphics("./Rmovies/BiNorm.gif") ## include animation in HTML output
The following widget has an interactive 3D plot of the bivariate normal density.
rglwidget(elementId = "BiNorm") ## Create an interative widget in HTML output
clipplanes3d(a=0,b=0,c=-1,d=0.04) ## slice the bell horizontally
rglwidget(elementId = "BiNorm_cut")
We consider f(x,1), so the density at \(y=1\)
persp3d(x, y, z, col='steelblue', alpha=0.7)
planes3d(a=0, b=-1, c=0, d=1, alpha=0.5, col='green')
grid3d(c("x", "y", "z"))
rglwidget(elementId = "BiNorm_cond")
clipplanes3d(a=0,b=-1,c=0,d=1) ## slice the bell vertically (conditional distribution)
rglwidget(elementId = "BiNorm_cond_cut")
movie3d(spin3d(axis=c(0,0,1),rpm=4), duration = 15, movie ="BiNorm_cond", type="gif",
dir="./Rmovies",verbose = FALSE)
knitr::include_graphics("./Rmovies/BiNorm_cond.gif") ## include animation in HTML output
If we scale this curve by the probability that \(y\) at(around) \(y=1\), then we have the conditional density \(f(x|y=1)\).
Projecting all these bell curves to the x-axis, we have
plot(x,z[,80],type='l',ylab='f(x,fixed y)') ## this curve is for y=1.
for (i in 1:dim(z)[2]){
lines(x,z[,i])
}
The marginal density of \(X\) is the weighted aggregation of all curves in above figure, where the weight is probability that \(Y=y\).
Now we generate a random sample from above Binormal distributions
## Use rmvnorm function to generate multivariate normal data
XYi=data.frame(rmvnorm(1000,mean=c(0,1),sigma=CovXY))
names(XYi)=c('x','y') ## assign column names
Check the scatter plot of data with 4 contour lines, sample mean point and populational mean point.
plot(XYi[,1],XYi[,2],pch=20,col=rgb(0.2,0.2,1,alpha=0.3),
xlab='x',ylab='y',main='Bivariate Normal Random Variables')
## Add the means of X and Y on the plot
points(mean(XYi[,1]),mean(XYi[,2]),col='red',pch=19) ## sample mean
points(0,1,col='green',pch=2,cex=1.5) ## population mean by green triangle
contour(x,y,z,add=T,nlevel=4,labcex=1,lwd=2) ## contour lines
We see the darkest part of x-axis is around 0 and that of y-axis is around 1. The contour lines close to the center are also denser. The following 3D histogram and 2D heat map also show similar observation.
library(plot3D)
## Create cuts:
x_c = cut(XYi[,1], 20)
y_c = cut(XYi[,2], 20)
## Calculate joint counts at cut levels:
z_c <- table(x_c, y_c)
## Plot as a 3D histogram:
hist3D(z=z_c, border="black")
## Plot as a 2D heatmap:
image2D(z=z_c, border="black")
We draw a histogram of \(X\) given \(Y\) is around 1. First define a horizontal strip: \(Y\) is within \(1-delta \sim 1+\delta\). Second we draw histogram of all \(X\) in the defined strip, which looks like normal.
delta=0.5
yv=1
library(ggplot2)
ggplot(XYi, aes(x = x, y = y)) +geom_point()+
geom_point(x=0,y=1,col="red",shape="*",size=15)+
geom_ribbon(aes(ymin=yv-delta,ymax=yv+delta),alpha=0.2,col="blue")
hist(XYi[(XYi[,2]>yv-delta &XYi[,2]<yv+delta),1],
main="Histogram of X given Y",xlab="")
If we push the observations to the Y-axis, we can find the marginal histogram for \(Y\). Similarly, marginal histogram for \(X\) is plotted at the bottom. (The points are jiggled a little to see the frequency.)
library(ggplot2)
scatter <- qplot(x,y, data=XYi) +
scale_x_continuous(limits=c(min(x),max(x))) +
scale_y_continuous(limits=c(min(y),max(y))) +
geom_rug(col=rgb(.5,0,0,alpha=.2))+
geom_point(x=0,y=1,col="red",shape="*",size=15)
scatter
# Plot the scatter plot with marginal histograms
library(ggExtra)
p = ggplot(XYi, aes(x = x, y = y)) +geom_point()+
geom_point(x=0,y=1,col="red",shape="*",size=15)
ggMarginal(p, type = "histogram")
Consider \(E(Y|X=x)\) as a function of \(x\). For example, we take vertical strips at \(x=-1.5,x=0\) and \(x=1.5\), and display histograms of \(Y\) conditional on these values of \(X\). You can see how the conditional mean of \(Y\) increases as the value of \(X\) increases.
delta=0.25
xv=c(-1.5,0,1.5)
#xv=0
library(ggplot2)
p1=ggplot(XYi, aes(x = x, y = y)) +geom_point()+
geom_point(x=0,y=1,col="red",shape="*",size=15)+
geom_ribbon(aes(xmin=xv[1]-delta,xmax=xv[1]+delta),alpha=0.2,col="blue")+
geom_ribbon(aes(xmin=xv[2]-delta,xmax=xv[2]+delta),alpha=0.2,col="blue")+
geom_ribbon(aes(xmin=xv[3]-delta,xmax=xv[3]+delta),alpha=0.2,col="blue")
histdat=c()
ybar.cond=c()
for (i in 1:length(xv)){
yi=XYi[(XYi[,1]>xv[i]-delta &XYi[,1]<xv[i]+delta),2]
histdat=rbind(histdat, cbind(x=xv[i],y=yi) )
ybar.cond=rbind(ybar.cond, cbind(x=xv[i],ybar=mean(yi)) )
}
histdat=as.data.frame(histdat)
ybar.cond=as.data.frame(ybar.cond)
p2=ggplot(histdat, aes(x = y, fill = factor(x))) +
geom_histogram(aes(y = ..density..))+ ## plot histogram for each value of x
geom_density()+ ## overlay density
facet_grid(cols =vars(x))+ ## histogram by group in different plots
geom_vline(data=ybar.cond,aes(xintercept = ybar),colour = "black")+
theme(legend.position = "none")+ ## remove legend
coord_flip() ## rotate the plot
library(gridExtra) ## for arrangment of ggplots
grid.arrange(p1, p2, nrow=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
This is the idea of a REGRESSION model. Source: Introductory
Statistics