# Please load the packages. 
library(ggplot2)
library(ipsRdbs)
# Start with a clean workspace 
rm(list=ls())
summary(bodyfat)
##     Skinfold         Bodyfat      
##  Min.   : 28.00   Min.   : 5.630  
##  1st Qu.: 37.52   1st Qu.: 6.968  
##  Median : 47.70   Median : 8.625  
##  Mean   : 51.42   Mean   : 9.251  
##  3rd Qu.: 58.15   3rd Qu.:10.010  
##  Max.   :113.50   Max.   :19.940
plot(bodyfat$Skinfold,  bodyfat$Bodyfat, xlab="Skin", ylab="Fat")

plot(bodyfat$Skinfold,  log(bodyfat$Bodyfat), xlab="Skin", ylab="log Fat")

plot(log(bodyfat$Skinfold),  log(bodyfat$Bodyfat), xlab="log Skin", ylab="log Fat")

old.par <- par(no.readonly = TRUE)
# par(mfrow=c(2,2)) # draws four plots in a graph
plot(bodyfat$Skinfold,  bodyfat$Bodyfat, xlab="Skin", ylab="Fat")

plot(bodyfat$Skinfold,  log(bodyfat$Bodyfat), xlab="Skin", ylab="log Fat")

plot(log(bodyfat$Skinfold),  log(bodyfat$Bodyfat), xlab="log Skin", ylab="log Fat")

plot(1:5, 1:5, axes=FALSE, xlab="", ylab="",  type="n")
text(2, 2, "Log both X and Y")
text(2, 1, "To have the best plot")

# Keep the transformed variables in the data set 
bodyfat$logskin <- log(bodyfat$Skinfold)
bodyfat$logbfat <- log(bodyfat$Bodyfat)
bodyfat$logskin <- log(bodyfat$Skinfold)
par(old.par)
 # Create a grouped variable 
bodyfat$cutskin <- cut(log(bodyfat$Skinfold), breaks=6) 
boxplot(data=bodyfat, Bodyfat~cutskin, col=2:7)

head(bodyfat)
##   Skinfold Bodyfat  logskin  logbfat     cutskin
## 1     44.5    8.47 3.795489 2.136531  (3.57,3.8]
## 2     41.8    7.68 3.732896 2.038620  (3.57,3.8]
## 3     33.7    6.16 3.517498 1.818077 (3.33,3.57]
## 4     50.9    8.56 3.929863 2.147100  (3.8,4.03]
## 5     40.5    6.86 3.701302 1.925707  (3.57,3.8]
## 6     51.2    9.40 3.935740 2.240710  (3.8,4.03]
require(ggplot2)
p2 <- ggplot(data=bodyfat, aes(x=cutskin, y=logbfat)) + 
geom_boxplot(col=2:7) + 
stat_summary(fun=mean, geom="line", aes(group=1), col="blue", linewidth=1) +
labs(x="Skinfold", y="Percentage of log bodyfat", 
title="Boxplot of log-bodyfat percentage vs grouped log-skinfold")  
plot(p2)

# Calculate summary statistics
bodyfat$logskin <- log(bodyfat$Skinfold)
bodyfat$logbfat <- log(bodyfat$Bodyfat)
n <- nrow(bodyfat)
x <- bodyfat$logskin
y <- bodyfat$logbfat
xbar <- mean(x)
ybar <- mean(y)
sx2 <- var(x)
sy2 <- var(y)
sxy <- cov(x, y)
r <- cor(x, y)
print(list(n=n, xbar=xbar, ybar=ybar, sx2=sx2, sy2=sy2, sxy=sxy, r=r))

$n [1] 102

$xbar [1] 3.883108

$ybar [1] 2.175978

$sx2 [1] 0.1084285

$sy2 [1] 0.09106112

$sxy [1] 0.09566069

$r [1] 0.9627095

# Find parameter estimates by simple calculations in R
hatbeta1 <- r * sqrt(sy2/sx2) # calculates estimate of the slope
hatbeta0 <- ybar - hatbeta1 * xbar # calculates estimate of the intercept
rs <-  y - hatbeta0 - hatbeta1 * x  # calculates residuals
s2 <- sum(rs^2)/(n-2)  # calculates estimate of sigma2
print(list(beta0=hatbeta0, beta1=hatbeta1, sigma2=s2))
## $beta0
## [1] -1.249881
## 
## $beta1
## [1] 0.8822466
## 
## $sigma2
## [1] 0.006731453
# Verify the above by model fitting  
bfat.lm <- lm(logbfat ~ logskin, data=bodyfat)
summary(bfat.lm)
## 
## Call:
## lm(formula = logbfat ~ logskin, data = bodyfat)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.223408 -0.055661  0.000052  0.056390  0.152984 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.24988    0.09661  -12.94   <2e-16 ***
## logskin      0.88225    0.02479   35.59   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08205 on 100 degrees of freedom
## Multiple R-squared:  0.9268, Adjusted R-squared:  0.9261 
## F-statistic:  1266 on 1 and 100 DF,  p-value: < 2.2e-16
anova(bfat.lm)
## Analysis of Variance Table
## 
## Response: logbfat
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## logskin     1 8.5240  8.5240  1266.3 < 2.2e-16 ***
## Residuals 100 0.6731  0.0067                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
### Check the diagnostics 
plot(bfat.lm$fit, bfat.lm$res, xlab="Fitted values", ylab = "Residuals")
abline(h=0)

### Should be a random scatter
qqnorm(bfat.lm$res)
qqline(bfat.lm$res)

plot(bodyfat$logskin,  bodyfat$logbfat, xlab="log Skin", ylab="log Fat")
abline(bfat.lm, col=7)
title("Scatter plot with the fitted Linear Regression line")

# 95% CI for beta(1)
# 0.88225 + c(-1, 1) * qt(0.975, df=100) *  0.02479 
# round(0.88225 + c(-1, 1) * qt(0.975, df=100) *  0.02479, 2)
# To test H0: beta1 = 1. 
tstat <- (0.88225 -1)/0.02479 
pval <- 2 * (1- pt(abs(tstat), df=100))
x <- seq(from=-5, to=5, length=500)
y <- dt(x, df=100)
plot(x, y,  xlab="", ylab="", type="l")
title("T-density with df=100")
abline(v=abs(tstat))
abline(h=0)
x1 <- seq(from=abs(tstat), to=10, length=100)
y1 <- rep(0, length=100)
x2 <- x1
y2 <- dt(x1, df=100)
segments(x1, y1, x2, y2)
abline(h=0)

# Predict at a new value of Skinfold=70
# Create a new data set called new
newx <- data.frame(logskin=log(70))
a <- predict(bfat.lm, newdata=newx, se.fit=TRUE) 
# Confidence interval for the mean of log bodyfat  at skinfold=70
a <- predict(bfat.lm, newdata=newx, interval="confidence") 
a
##        fit      lwr     upr
## 1 2.498339 2.474198 2.52248
# Prediction interval for a future log bodyfat  at skinfold=70
a <- predict(bfat.lm, newdata=newx, interval="prediction") 
a
##        fit      lwr      upr
## 1 2.498339 2.333783 2.662895

#Obtain prediction intervals for the mean 
pred.bfat.clim <- predict(bfat.lm, data=bodyfat, interval="confidence")
#prediction intervals for future observation
pred.bfat.plim <- suppressWarnings(predict(bfat.lm, data=bodyfat, interval="prediction"))
plot(bodyfat$logskin,  bodyfat$logbfat, xlab="log Skin", ylab="log Fat")
abline(bfat.lm, col=5)
lines(log(bodyfat$Skinfold), pred.bfat.clim[,2], lty=2, col=2) 
lines(log(bodyfat$Skinfold), pred.bfat.clim[,3], lty=2, col=2) 
lines(log(bodyfat$Skinfold), pred.bfat.plim[,2], lty=4, col=3) 
lines(log(bodyfat$Skinfold), pred.bfat.plim[,3], lty=4, col=3) 
title("Scatter plot with the fitted line and prediction intervals")
symb <- c("Fitted line", "95% CI for mean", "95% CI for observation")
## legend(locator(1), legend = symb, lty = c(1, 2, 4), col = c(5, 2, 3))
# Shows where we predicted earlier 
abline(v=log(70))