(Perhaps surprisingly, this example is still relevant)
http://www.nature.com/ejhg/journal/v17/n8/full/ejhg20095a.html
Predicting height: the Victorian approach beats modern genomics
Brian Caffo, Jeff Leek and Roger Peng
Johns Hopkins Bloomberg School of Public Health
http://www.nature.com/ejhg/journal/v17/n8/full/ejhg20095a.html
Predicting height: the Victorian approach beats modern genomics
(Simply Statistics is a blog by Jeff Leek, Roger Peng and Rafael Irizarry, who wrote this post, link on the image)
install.packages("UsingR")
if the UsingR
library is not installed.library(UsingR); data(galton); library(reshape); long <- melt(galton)
g <- ggplot(long, aes(x = value, fill = variable))
g <- g + geom_histogram(colour = "black", binwidth=1)
g <- g + facet_grid(. ~ variable)
g
library(manipulate)
myHist <- function(mu){
mse <- mean((galton$child - mu)^2)
g <- ggplot(galton, aes(x = child)) + geom_histogram(fill = "salmon", colour = "black", binwidth=1)
g <- g + geom_vline(xintercept = mu, size = 3)
g <- g + ggtitle(paste("mu = ", mu, ", MSE = ", round(mse, 2), sep = ""))
g
}
manipulate(myHist(mu), mu = slider(62, 74, step = 0.5))
g <- ggplot(galton, aes(x = child)) + geom_histogram(fill = "salmon", colour = "black", binwidth=1)
g <- g + geom_vline(xintercept = mean(galton$child), size = 3)
g
\[ \begin{align} \sum_{i=1}^n (Y_i - \mu)^2 & = \ \sum_{i=1}^n (Y_i - \bar Y + \bar Y - \mu)^2 \\ & = \sum_{i=1}^n (Y_i - \bar Y)^2 + \ 2 \sum_{i=1}^n (Y_i - \bar Y) (\bar Y - \mu) +\ \sum_{i=1}^n (\bar Y - \mu)^2 \\ & = \sum_{i=1}^n (Y_i - \bar Y)^2 + \ 2 (\bar Y - \mu) \sum_{i=1}^n (Y_i - \bar Y) +\ \sum_{i=1}^n (\bar Y - \mu)^2 \\ & = \sum_{i=1}^n (Y_i - \bar Y)^2 + \ 2 (\bar Y - \mu) (\sum_{i=1}^n Y_i - n \bar Y) +\ \sum_{i=1}^n (\bar Y - \mu)^2 \\ & = \sum_{i=1}^n (Y_i - \bar Y)^2 + \sum_{i=1}^n (\bar Y - \mu)^2\\ & \geq \sum_{i=1}^n (Y_i - \bar Y)^2 \ \end{align} \]
ggplot(galton, aes(x = parent, y = child)) + geom_point()
Size of point represents number of points at that (X, Y) combination (See the Rmd file for the code).
y <- galton$child - mean(galton$child)
x <- galton$parent - mean(galton$parent)
freqData <- as.data.frame(table(x, y))
names(freqData) <- c("child", "parent", "freq")
freqData$child <- as.numeric(as.character(freqData$child))
freqData$parent <- as.numeric(as.character(freqData$parent))
myPlot <- function(beta){
g <- ggplot(filter(freqData, freq > 0), aes(x = parent, y = child))
g <- g + scale_size(range = c(2, 20), guide = "none" )
g <- g + geom_point(colour="grey50", aes(size = freq+20, show_guide = FALSE))
g <- g + geom_point(aes(colour=freq, size = freq))
g <- g + scale_colour_gradient(low = "lightblue", high="white")
g <- g + geom_abline(intercept = 0, slope = beta, size = 3)
mse <- mean( (y - beta * x) ^2 )
g <- g + ggtitle(paste("beta = ", beta, "mse = ", round(mse, 3)))
g
}
manipulate(myPlot(beta), beta = slider(0.6, 1.2, step = 0.02))
lm(I(child - mean(child))~ I(parent - mean(parent)) - 1, data = galton)
Call:
lm(formula = I(child - mean(child)) ~ I(parent - mean(parent)) -
1, data = galton)
Coefficients:
I(parent - mean(parent))
0.646