Chapter 3: Exploratory data analysis methods

Sujit K. Sahu

University of Southampton

2022-03-30

Abstract

This file contains all the code for reproducing the figures in Chapter 3.

## Note the start time
start.time<-proc.time()[3]
# Download mapfiles and data files from https://github.com/sujit-sahu/bookbmstdr
figurepath <- "figures/chap3figures/"
mappath <- "mapfiles/"
load(file=paste0(mappath, "englamap.rda"))
load(file=paste0(mappath, "engregmap.rda"))
load(file=paste0(mappath, "aurnsites.rda"))
colpalette <- c("dodgerblue4", "dodgerblue2", "firebrick2",  "firebrick4",  "purple")     
library(bmstdr)
## Warning in .recacheSubclasses(def@className, def, env): undefined subclass
## "numericVector" of class "Mnumeric"; definition not updated
library(ggplot2)
library(akima)
library(tidyr)
library(RColorBrewer)
library(ggpubr)
library(spdep)
library(GGally)
library(geoR)
library(fields)
library(doBy)

1 Code to reproduce Figure3.1

summary(nyspatial)
##     s.index        Longitude         Latitude          utmx       
##  Min.   : 1.00   Min.   :-79.59   Min.   :40.74   Min.   :121813  
##  1st Qu.: 7.75   1st Qu.:-75.99   1st Qu.:41.34   1st Qu.:420017  
##  Median :14.50   Median :-74.08   Median :42.54   Median :576087  
##  Mean   :14.50   Mean   :-75.01   Mean   :42.38   Mean   :500016  
##  3rd Qu.:21.25   3rd Qu.:-73.76   3rd Qu.:43.23   3rd Qu.:602346  
##  Max.   :28.00   Max.   :-72.71   Max.   :44.39   Max.   :692461  
##       utmy              yo3           xmaxtemp         xwdsp      
##  Min.   :4510117   Min.   :39.18   Min.   :26.05   Min.   :4.030  
##  1st Qu.:4577789   1st Qu.:44.92   1st Qu.:26.53   1st Qu.:4.475  
##  Median :4710883   Median :47.24   Median :27.31   Median :5.000  
##  Mean   :4693783   Mean   :47.88   Mean   :27.54   Mean   :5.095  
##  3rd Qu.:4789464   3rd Qu.:51.41   3rd Qu.:28.57   3rd Qu.:5.743  
##  Max.   :4916157   Max.   :57.60   Max.   :29.70   Max.   :6.146  
##       xrh       
##  Min.   :3.455  
##  1st Qu.:3.687  
##  Median :3.725  
##  Mean   :3.742  
##  3rd Qu.:3.868  
##  Max.   :3.950
p <- ggplot(nyspatial, aes(x=yo3)) +
  geom_histogram(binwidth=4.5, color="black", fill="white") +
  labs(x="Average daily ozone concentration", y = "count")
p

ggsave(filename=paste0(figurepath, "hist_nyspatial.png"))
## Saving 7 x 5 in image

p <- ggpairs(nyspatial, columns=6:9)
p

ggsave(filename=paste0(figurepath, "pairs_nyspatial.png"))
## Saving 7 x 5 in image
a <- bmstdr_variogram(data=nyspatial, formula = yo3~utmx + utmy, coordtype="utm", nb=50)

## `geom_smooth()` using formula 'y ~ x'

names(a)
## [1] "cloud"         "variogram"     "cloudplot"     "variogramplot"

ggarrange(a$cloudplot, a$variogramplot, legend = "none", nrow = 1, ncol = 2)
## `geom_smooth()` using formula 'y ~ x'

ggsave(file=paste0(figurepath, "variogram_nyspatial.png"))
## Saving 7 x 5 in image
## Check variogram using the geoR package 

head(nyspatial)
##   s.index Longitude Latitude     utmx    utmy      yo3 xmaxtemp    xwdsp
## 1       1   -73.757   42.681 601838.1 4726140 45.08065 27.53184 4.142083
## 2       2   -73.881   40.866 594300.8 4524485 48.34758 29.60294 5.666202
## 3       3   -79.587   42.291 121812.7 4692285 54.65677 26.28780 5.742751
## 4       4   -76.802   42.111 351019.8 4663672 44.95774 27.44725 4.842872
## 5       5   -73.743   41.782 604457.3 4626336 43.92903 28.27555 4.427053
## 6       6   -78.771   42.993 192581.3 4766941 55.37726 26.61813 6.146428
##        xrh
## 1 3.686737
## 2 3.478130
## 3 3.950145
## 4 3.699996
## 5 3.698446
## 6 3.869971
y <- as.geodata(nyspatial, coords.col=4:5, data.col=6, covar.col=7:9)
vgram <- variog(y, estimator.type="classical")
## variog: computing omnidirectional variogram
vgram.robust <- variog(y, estimator.type="modulus")
## variog: computing omnidirectional variogram
plot(vgram)

plot(vgram.robust)


krig.fit1<- Krig(x=nyspatial[, c("utmx", "utmy")], Y=nyspatial$yo3, Z=as.matrix(nyspatial[, 7:9]))
summary(krig.fit1)
## CALL:
## Krig(x = nyspatial[, c("utmx", "utmy")], Y = nyspatial$yo3, Z = as.matrix(nyspatial[, 
##     7:9]))
##                                              
##  Number of Observations:                28   
##  Number of unique points:               28   
##  Number of parameters in the null space 6    
##  Parameters for fixed spatial drift     3    
##  Effective degrees of freedom:          13.6 
##  Residual degrees of freedom:           14.4 
##  MLE tau                                2.649
##  GCV tau                                2.989
##  MLE sigma                              3.672
##  Scale passed for covariance (sigma)    <NA> 
##  Scale passed for nugget (tau^2)        <NA> 
##  Smoothing parameter lambda             1.911
## 
## Residual Summary:
##      min    1st Q   median    3rd Q      max 
## -5.28400 -1.10700  0.09712  1.29800  4.71800 
## 
## Covariance Model: stationary.cov
##   Covariance function is 
##   Names of non-default covariance arguments: 
##        
## 
## DETAILS ON SMOOTHING PARAMETER:
##  Method used:   REML    Cost:  1
##    lambda       trA       GCV   GCV.one GCV.model    tauHat 
##     1.911    13.557    17.318    17.318        NA     2.989 
## 
##  Summary of all estimates found for lambda
##             lambda   trA   GCV tauHat -lnLike Prof converge
## GCV        0.06797 26.60 17.32 0.9306        59.93       NA
## GCV.model       NA    NA    NA     NA           NA       NA
## GCV.one    0.06797 26.60 17.32 0.9306           NA       NA
## RMSE            NA    NA    NA     NA           NA       NA
## pure error      NA    NA    NA     NA           NA       NA
## REML       1.91130 13.56 17.32 2.9888        59.93        1
predsurface <- predict(krig.fit1, x=gridnyspatial[, c("utmx", "utmy")],  Z=as.matrix(gridnyspatial[, 6:8]))

ksurf <- gridnyspatial[, c("Longitude","Latitude")]
ksurf$krig <- predsurface ## u has the x, y, z

coord <- nyspatial[, c("Longitude","Latitude")]

xo <- seq(from=min(coord$Longitude)-0.5, to = max(coord$Longitude)+0.8, length=200)
yo <- seq(from=min(coord$Latitude)-0.25, to = max(coord$Latitude)+0.8, length=200)
surf <- interp(ksurf$Longitude, ksurf$Latitude, ksurf$krig,  xo=xo, yo=yo)

names(surf)
## [1] "x" "y" "z"
surf <- list(x=surf$x, y=surf$y, z=surf$z)
v <- fnc.delete.map.XYZ(xyz=surf)

interp1 <- data.frame(long = v$x, v$z )
names(interp1)[1:length(v$y)+1] <- v$y
interp1 <- gather(interp1,key = lat,value =Kriged,-long,convert = TRUE)

nymap <- map_data(database="state",regions="new york")
maplonglat <- cbind(nymap$long, nymap$lat)
com <- rev(c("firebrick4","firebrick2","white","dodgerblue2","dodgerblue4"))#colour palette
zr <- range(interp1$Kriged, na.rm=T)
P <- ggplot() +  
  geom_raster(data=interp1, aes(x = long, y = lat,fill = Kriged)) +
  geom_polygon(data=nymap, aes(x=long, y=lat, group=group), color="black", size = 0.6, fill=NA) + 
  geom_point(data=coord, aes(x=Longitude,y=Latitude))  +
  stat_contour(data=na.omit(interp1), aes(x = long, y = lat,z = Kriged), colour = "black", binwidth =0.4) +
  scale_fill_gradientn(colours=com, na.value="gray95", limits=zr) +
  theme(axis.text = element_blank(), axis.ticks = element_blank()) +
  ggsn::scalebar(data =interp1, dist = 100, location = "bottomleft", transform=T, dist_unit = "km",
                 st.dist = .05, st.size = 5, height = .06, st.bottom=T, model="WGS84") +
  ggsn::north(data=interp1, location="topleft", symbol=12) +
  labs(title= "Kriged map of ozone air pollution in New York", x="Longitude", y = "Latitude", size=2.5) 

P

ggsave(file=paste(figurepath, "kriged_ny.png", sep=""))
## Saving 7 x 5 in image

2 Code for exploring spatio-temporal data


fun_mean_var <-  function(x) { 
  c(mean = mean(x, na.rm=T), s2 = var(x, na.rm=T))
}

a <- nysptime 
a$s.index <- as.factor(a$s.index)
a$sqrty <- sqrt(a$y8hrmax)

sum_orig  <- summaryBy(y8hrmax ~ s.index, data = a, FUN = fun_mean_var)
sum_sqrt  <- summaryBy(sqrty ~ s.index, data = a, FUN = fun_mean_var)

sum_orig$s.index <- as.factor(sum_orig$s.index)
sum_sqrt$s.index <- as.factor(sum_sqrt$s.index)

p1 <- ggplot(sum_orig, aes(x=y8hrmax.mean, y=y8hrmax.s2))+ 
  geom_smooth(method="lm", se=F) +
  geom_point(size=1) +   
  theme(legend.position = NULL, legend.key.size = unit(0.5, "cm")) +
  labs( title= "Original scale", x="mean", y="variance") 
p1
## `geom_smooth()` using formula 'y ~ x'


p2 <- ggplot(sum_sqrt, aes(x=sqrty.mean, y=sqrty.s2))+
  geom_point(size=1) +   
  geom_smooth(method="lm", se=F) +
  theme(legend.position = NULL, legend.key.size = unit(0.5, "cm")) +
  labs( title= "Square root scale", x="mean", y="variance") 
p2
## `geom_smooth()` using formula 'y ~ x'


ggarrange(p1, p2, nrow = 1)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

ggsave(file=paste0(figurepath, "mean_variance_nysptime.png"), width=12, height = 6, units="cm")
a <- nysptime 
a$time <- rep(1:62, 28)
a$s.index <- as.factor(a$s.index)
head(a)
sp <- ggplot(data=a, aes(x=time, y=y8hrmax, group=s.index, col=s.index)) +
  geom_line() +
  theme(legend.position="none")
sp 
ggsave(file=paste0(figurepath, "timeseriesplot_nysptime.png"))
p <- ggplot(data=na.omit(nysptime), aes(x=as.factor(s.index), y=y8hrmax)) + 
  geom_boxplot(outlier.colour="red", outlier.shape=8,
               outlier.size=2) + 
  labs(title= "Site wise boxplots", x="Site", y = "Daily ozone", size=2.5) 
p 
ggsave(filename=paste0(figurepath, "boxplots_sptime.png"))
s <- c(8,11,12,14,18,21,24,28)
a <- nysptime 
a$time <- rep(1:62, 28)
a$s.index <- as.factor(a$s.index)
vdat <- a[which(nysptime$s.index%in%s), ]
np <- ggplot(data=vdat, aes(x=time, y=y8hrmax,shape=s.index, col=s.index)) +
  geom_point() + 
  geom_line() + 
  # facet_wrap(~ s.index, ncol=4) +
  labs(x="Day", y = "Daily ozone", size=2.5) 
np 
## Warning: The shape palette can deal with a maximum of 6 discrete values because
## more than 6 becomes difficult to discriminate; you have 8. Consider
## specifying shapes manually if you must have them.
## Warning: Removed 127 rows containing missing values (geom_point).
## Warning: Removed 5 row(s) containing missing values (geom_path).

ggsave(filename=paste0(figurepath, "validation_time_series.png"))
## Warning: The shape palette can deal with a maximum of 6 discrete values because
## more than 6 becomes difficult to discriminate; you have 8. Consider
## specifying shapes manually if you must have them.
## Warning: Removed 127 rows containing missing values (geom_point).
## Warning: Removed 5 row(s) containing missing values (geom_path).
 
phist <- ggplot(nysptime, aes(x=sqrt(y8hrmax))) +
  geom_histogram(binwidth=0.5, color="black", fill="white") +
  labs(x="Square-root ozone", y = "count")
phist
## Warning: Removed 24 rows containing non-finite values (stat_bin).


a <- nysptime 
a$Month <- as.factor(a$Month)
## max temperature
ptmp <- ggplot(a, aes(x=xmaxtemp, y=sqrt(y8hrmax), color=Month)) +
  geom_point(shape=16, size=0.5) +
  stat_smooth(method = "lm", col = "black")+
  labs(x = "Maximum temperature", y = "Square root ozone")+
  # geom_text(aes(label=s.index), hjust = -0.7, show.legend = F)+
  theme(legend.position = c(0.1, 0.85), legend.key.size = unit(0.2, "cm"))

ptmp
## Warning: Removed 24 rows containing non-finite values (stat_smooth).
## Warning: Removed 24 rows containing missing values (geom_point).


pwdsp <- ggplot(a, aes(x=xwdsp, y=sqrt(y8hrmax), color=Month)) +
  geom_point(shape=16, size=0.5) +
  stat_smooth(method = "lm", col = "black")+
  labs(x = "Wind speed", y = "Square root ozone")+
  # geom_text(aes(label=s.index), hjust = -0.7, show.legend = F)+
  theme(legend.position =c(0.1, 0.85), legend.key.size = unit(0.2, "cm"))

pwdsp
## Warning: Removed 24 rows containing non-finite values (stat_smooth).

## Warning: Removed 24 rows containing missing values (geom_point).


prh <- ggplot(a, aes(x=xrh, y=sqrt(y8hrmax), color=Month)) +
  geom_point(shape=16, size=0.5) +
  stat_smooth(method = "lm", col = "black")+
  labs(x = "Relative humidity", y = "Square root ozone")+
  # geom_text(aes(label=s.index), hjust = -0.7, show.legend = F)+
  theme(legend.position =c(0.85, 0.85), legend.key.size = unit(0.2, "cm"))
prh
## Warning: Removed 24 rows containing non-finite values (stat_smooth).

## Warning: Removed 24 rows containing missing values (geom_point).


ggarrange(phist, ptmp, pwdsp, prh, nrow = 2, ncol=2)
## Warning: Removed 24 rows containing non-finite values (stat_bin).
## Warning: Removed 24 rows containing non-finite values (stat_smooth).
## Warning: Removed 24 rows containing missing values (geom_point).
## Warning: Removed 24 rows containing non-finite values (stat_smooth).
## Warning: Removed 24 rows containing missing values (geom_point).
## Warning: Removed 24 rows containing non-finite values (stat_smooth).
## Warning: Removed 24 rows containing missing values (geom_point).

ggsave(file=paste0(figurepath, "nysptime_edaplots.png"), width=12, height = 8, units="cm")

3 EDA for areal data


colnames(engtotals)
##  [1] "Areacode"      "mapid"         "spaceid"       "Region"       
##  [5] "popn"          "jsa"           "houseprice"    "popdensity"   
##  [9] "no2"           "covid"         "allcause"      "noofcases"    
## [13] "Edeaths"       "Ecases"        "logEdeaths"    "logEcases"    
## [17] "casesmr"       "nweek"         "noofhighweeks"
sum(engtotals$covid)
## [1] 49292


## Moran's I

set.seed(44)
summary(engtotals$covid)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     4.0    77.0   118.0   157.5   200.0  1223.0

covidrate <- engtotals$covid/engtotals$popn*100000

load(file=paste0(mappath, "/englad.rda"))
nbhood <- poly2nb(englad)
Wlist <-  nb2listw(nbhood, style = "B", zero.policy = T)
moran.mc(engtotals$covid, Wlist, nsim=1000, zero.policy=T)
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  engtotals$covid 
## weights: Wlist  
## number of simulations + 1: 1001 
## 
## statistic = 0.33891, observed rank = 1001, p-value = 0.000999
## alternative hypothesis: greater
geary.mc(engtotals$covid, Wlist, nsim=1000, zero.policy=T)
## 
##  Monte-Carlo simulation of Geary C
## 
## data:  engtotals$covid 
## weights: Wlist 
## number of simulations + 1: 1001 
## 
## statistic = 0.80762, observed rank = 29, p-value = 0.02897
## alternative hypothesis: greater
## For the covid rate per 100k 
moran.mc(covidrate, Wlist, nsim=1000, zero.policy=T)
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  covidrate 
## weights: Wlist  
## number of simulations + 1: 1001 
## 
## statistic = 0.44945, observed rank = 1001, p-value = 0.000999
## alternative hypothesis: greater
geary.mc(covidrate, Wlist, nsim=1000, zero.policy=T)
## 
##  Monte-Carlo simulation of Geary C
## 
## data:  covidrate 
## weights: Wlist 
## number of simulations + 1: 1001 
## 
## statistic = 0.52418, observed rank = 1, p-value = 0.000999
## alternative hypothesis: greater
head(engtotals)
##    Areacode mapid spaceid    Region   popn jsa houseprice popdensity       no2
## 1 E06000001     0       1 NorthEast  93663 0.6     127000        999  6.092857
## 2 E06000002     1       2 NorthEast 140980 1.1     135000       2616  9.437047
## 3 E06000003     2       3 NorthEast 137150 1.0     132000        560  8.686981
## 4 E06000004     3       4 NorthEast 197348 0.8     148000        963  9.301667
## 5 E06000005     4       5 NorthEast 106803 0.8     141000        541  8.686981
## 6 E06000006     5       6 NorthWest 129410 0.2     146000       1636 18.064286
##   covid allcause noofcases  Edeaths   Ecases logEdeaths logEcases  casesmr
## 1   110      477       616 101.1573 425.0841   4.616676  6.052287 1.449125
## 2   208      765       971 162.2333 639.8295   5.089036  6.461202 1.517592
## 3   134      710       711 150.5695 622.4473   5.014425  6.433659 1.142265
## 4   153      807       980 171.1403 895.6524   5.142483  6.797552 1.094174
## 5    90      522       611 110.7004 484.7192   4.706827  6.183570 1.260524
## 6   129      583       699 123.6366 587.3197   4.817347  6.375569 1.190152
##   nweek noofhighweeks
## 1    20             9
## 2    20             8
## 3    20             8
## 4    20             7
## 5    20             6
## 6    20             9
dim(engtotals)
## [1] 313  19
head(englamap)
##   id     long      lat order  hole piece group  Areacode
## 1  0 447097.0 537152.0     1 FALSE     1   0.1 E06000001
## 2  0 447228.8 537033.4     2 FALSE     1   0.1 E06000001
## 3  0 447280.7 537120.1     3 FALSE     1   0.1 E06000001
## 4  0 447378.1 537095.1     4 FALSE     1   0.1 E06000001
## 5  0 447455.3 537023.7     5 FALSE     1   0.1 E06000001
## 6  0 447550.5 537078.1     6 FALSE     1   0.1 E06000001

bdf <- merge(englamap, engtotals, by.x="id", by.y="mapid", all.y=T, all.x=F)
head(bdf)
##   id     long      lat order  hole piece group Areacode.x Areacode.y spaceid
## 1  0 447097.0 537152.0     1 FALSE     1   0.1  E06000001  E06000001       1
## 2  0 447228.8 537033.4     2 FALSE     1   0.1  E06000001  E06000001       1
## 3  0 447280.7 537120.1     3 FALSE     1   0.1  E06000001  E06000001       1
## 4  0 447378.1 537095.1     4 FALSE     1   0.1  E06000001  E06000001       1
## 5  0 447455.3 537023.7     5 FALSE     1   0.1  E06000001  E06000001       1
## 6  0 447550.5 537078.1     6 FALSE     1   0.1  E06000001  E06000001       1
##      Region  popn jsa houseprice popdensity      no2 covid allcause noofcases
## 1 NorthEast 93663 0.6     127000        999 6.092857   110      477       616
## 2 NorthEast 93663 0.6     127000        999 6.092857   110      477       616
## 3 NorthEast 93663 0.6     127000        999 6.092857   110      477       616
## 4 NorthEast 93663 0.6     127000        999 6.092857   110      477       616
## 5 NorthEast 93663 0.6     127000        999 6.092857   110      477       616
## 6 NorthEast 93663 0.6     127000        999 6.092857   110      477       616
##    Edeaths   Ecases logEdeaths logEcases  casesmr nweek noofhighweeks
## 1 101.1573 425.0841   4.616676  6.052287 1.449125    20             9
## 2 101.1573 425.0841   4.616676  6.052287 1.449125    20             9
## 3 101.1573 425.0841   4.616676  6.052287 1.449125    20             9
## 4 101.1573 425.0841   4.616676  6.052287 1.449125    20             9
## 5 101.1573 425.0841   4.616676  6.052287 1.449125    20             9
## 6 101.1573 425.0841   4.616676  6.052287 1.449125    20             9

bdf$smr <- bdf$covid/bdf$Edeaths
summary(bdf$smr)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1264  0.5625  0.8216  0.8040  1.0217  2.0711
psmr <-  ggplot(data=bdf, aes(x=long, y=lat, group = group, fill=smr)) +
  scale_fill_gradientn(colours=colpalette, na.value="black",limits=c(0.1, 2.1))  +
  geom_polygon(colour='black',size=0.25) +
  geom_polygon(data=engregmap, aes(x=long, y=lat, group = group), fill=NA, colour='black',size=0.6)  +
  coord_equal() + guides(fill=guide_colorbar(title="SMR")) +
  theme_bw()+theme(text=element_text(family="Times")) +
  labs(title= "SMR of covid deaths during Mar 13 to July 31, 2020",  x="", y = "", size=2.5) +
  theme(axis.text.x = element_blank(), axis.text.y = element_blank(),axis.ticks = element_blank())   +
  theme(legend.position =c(0.2, 0.5)) + 
  ggsn::scalebar(data=bdf, dist =50, location = "topleft", transform=F, dist_unit = "km",
                 st.dist = .05, st.size =4, height = .06, st.bottom=T)
psmr


ggsave(filename=paste0(figurepath, "smr_covid_death.png"))

summary(bdf$popdensity)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    25.0   142.0   235.0   739.5   575.0 16427.0
a <- log(range(bdf$popdensity))
ppop <-  ggplot(data=bdf, aes(x=long, y=lat, group = group, fill=log(popdensity))) +
  scale_fill_gradientn(colours=colpalette, na.value="black",limits=a)  +
  geom_polygon(colour='black',size=0.25) +
  geom_polygon(data=engregmap, aes(x=long, y=lat, group = group), fill=NA, colour='black',size=0.6)  +
  coord_equal() + guides(fill=guide_colorbar(title="Log pop")) +
  theme_bw()+theme(text=element_text(family="Times")) +
  labs(title= "Log population density in 2019",  x="", y = "", size=2.5) +
  theme(axis.text.x = element_blank(), axis.text.y = element_blank(),axis.ticks = element_blank())   +
  theme(legend.position =c(0.2, 0.5)) + 
  ggsn::scalebar(data=bdf, dist =50, location = "topleft", transform=F, dist_unit = "km",
                 st.dist = .05, st.size =4, height = .06, st.bottom=T)
ppop


ggsave(filename=paste0(figurepath, "log_pop_density.png"))

summary(bdf$jsa)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.100   0.200   0.200   0.298   0.300   1.500
a <- range(bdf$jsa)
pjsa <-  ggplot(data=bdf, aes(x=long, y=lat, group = group, fill=jsa)) +
  scale_fill_gradientn(colours=colpalette, na.value="black",limits=a)  +
  geom_polygon(colour='black',size=0.25) +
  geom_polygon(data=engregmap, aes(x=long, y=lat, group = group), fill=NA, colour='black',size=0.6)  +
  coord_equal() + guides(fill=guide_colorbar(title="JSA %")) +
  theme_bw()+theme(text=element_text(family="Times")) +
  labs(title= "Percentage of people receiving job seeker's allowance",  x="", y = "", size=2.5) +
  theme(axis.text.x = element_blank(), axis.text.y = element_blank(),axis.ticks = element_blank())   +
  theme(legend.position =c(0.2, 0.5)) + 
  ggsn::scalebar(data=bdf, dist =50, location = "topleft", transform=F, dist_unit = "km",
                 st.dist = .05, st.size =4, height = .06, st.bottom=T)
pjsa


ggsave(filename=paste0(figurepath, "england_jsa_dec_2019.png"))

summary(log10(bdf$houseprice))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.987   5.301   5.380   5.384   5.462   6.114
a <- range(log10(bdf$houseprice))
pprice <-  ggplot(data=bdf, aes(x=long, y=lat, group = group, fill=log10(houseprice))) +
  scale_fill_gradientn(colours=colpalette, na.value="black",limits=a)  +
  geom_polygon(colour='black',size=0.25) +
  geom_polygon(data=engregmap, aes(x=long, y=lat, group = group), fill=NA, colour='black',size=0.6)  +
  coord_equal() + guides(fill=guide_colorbar(title="log10 price")) +
  theme_bw()+theme(text=element_text(family="Times")) +
  labs(title= "Log (base 10) median house price in March 2020",  x="", y = "", size=2.5) +
  theme(axis.text.x = element_blank(), axis.text.y = element_blank(),axis.ticks = element_blank())   +
  theme(legend.position =c(0.2, 0.5)) + 
  ggsn::scalebar(data=bdf, dist =50, location = "topleft", transform=F, dist_unit = "km",
                 st.dist = .05, st.size =4, height = .06, st.bottom=T)
pprice


ggsave(filename=paste0(figurepath, "england_house_price_mar_2020.png"))

# ggarrange(psmr, ppop, pjsa, pprice, nrow = 2, ncol=2)
# ggsave(file=paste0(figurepath, "endeaths_socio_eco.png"), width=12, height =12, units="cm")



summary(bdf$noofcases)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      54     330     547     788     892    5260
bdf$caserate <- 100000 * bdf$noofcases/(bdf$popn*20) # 20 weeks 
summary(bdf$caserate)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.955  14.312  18.249  19.722  24.656  70.591
a <- range(bdf$caserate)

p32 <-  ggplot(data=bdf, aes(x=long, y=lat, group = group, fill=caserate)) +
  scale_fill_gradientn(colours=colpalette, na.value="black",limits=a)  +
  geom_polygon(colour='black',size=0.25) +
  geom_polygon(data=engregmap, aes(x=long, y=lat, group = group), fill=NA, colour='black',size=0.6)  +
  coord_equal() + guides(fill=guide_colorbar(title="Case rate")) +
  theme_bw()+theme(text=element_text(family="Times")) +
  labs(title= "Weekly cases per 100k  during Mar 13 to Jul 31, 2020",  x="", y = "", size=2.5) +
  theme(axis.text.x = element_blank(), axis.text.y = element_blank(),axis.ticks = element_blank())   +
  theme(legend.position =c(0.2, 0.5)) + 
  ggsn::scalebar(data=bdf, dist =50, location = "topleft", transform=F, dist_unit = "km",
                 st.dist = .05, st.size =4, height = .06, st.bottom=T)
p32


ggsave(filename=paste0(figurepath, "england_case_rate.png"))



engdeaths$caserate <- 100000*engdeaths$noofcases/engdeaths$popn
summary(engdeaths$caserate)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   4.503  13.847  21.711  33.030 277.409
ctime <- ggplot(data=engdeaths,  aes(x=factor(Weeknumber), y=caserate)) +
  geom_boxplot() +
  labs(#title="Boxplot of square-root precipitation against months by year",
    x = "Week", y = "No of cases per 100,000")  +
  stat_summary(fun=median, geom="line", aes(group=1, col="red")) +
  theme(legend.position = "none")
ctime


ggsave(filename =paste0(figurepath, "eng_boxplot_case_rate.png"))


colnames(engdeaths)
##  [1] "Areacode"      "mapid"         "spaceid"       "Region"       
##  [5] "popn"          "jsa"           "houseprice"    "popdensity"   
##  [9] "startdate"     "Weeknumber"    "no2"           "covid"        
## [13] "allcause"      "noofcases"     "lag1noofcases" "lag2noofcases"
## [17] "lag3noofcases" "lag4noofcases" "n0"            "n1"           
## [21] "n2"            "n3"            "n4"            "Edeaths"      
## [25] "Ecases"        "logEdeaths"    "logEcases"     "highdeathsmr" 
## [29] "caserate"
engdeaths$deathrate <- engdeaths$covid * 100000/engdeaths$popn
engdeaths$caserate <- engdeaths$noofcases * 100000/engdeaths$popn
u <- engdeaths[, c("Weeknumber", "deathrate", "caserate", "no2")]
head(u)
##   Weeknumber deathrate   caserate       no2
## 1         11  0.000000   2.135315  8.285714
## 2         12  1.067657   4.270630 12.000000
## 3         13  1.067657  20.285492  6.142857
## 4         14  5.338287  53.382873  7.142857
## 5         15 14.947204  48.044585  8.571429
## 6         16 10.676575 105.698088  2.428571
wkmeans <- aggregate.data.frame(u[, -1], by=list(Week=u[,1]), FUN=mean)
head(wkmeans)
##   Week   deathrate  caserate      no2
## 1   11  0.06751427  6.801892 18.89352
## 2   12  0.57516468 19.377759 20.83553
## 3   13  2.82722575 37.082191 12.63644
## 4   14  7.97997210 45.369661 18.88982
## 5   15 12.90445264 41.083810 17.69912
## 6   16 13.30915947 47.370480 12.46674

deathcolor <- "#FF0000"
casecolor <- rgb(0.2, 0.6, 0.9, 1)

twoplots <- ggplot(data=wkmeans, aes(x=Week)) +
  geom_bar( aes(y=caserate), stat="identity", size=.1, fill=casecolor, color="blue", alpha=.4) +
  geom_line(aes(y=deathrate), size=1, color=deathcolor) + 
  scale_y_continuous(
    # Features of the first axis
    name = "Deathrate per 100,000",
    # Add a second axis and specify its features
    sec.axis = sec_axis(~.*1, name="Caserate per 100,000")
  ) + 
  theme(
    axis.title.y = element_text(color = deathcolor, size=13),
    axis.title.y.right = element_text(color = casecolor, size=13)
  ) +
  ggtitle("Weekly death (left axis) and case (right axis) rate per 100,000")
twoplots



pdf(file=paste0(figurepath, "death_case_by_week.pdf"))
twoplots
dev.off()
## png 
##   2

summary(bdf$noofcases/bdf$Ecases)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1743  0.6307  0.8042  0.8691  1.0865  3.1108
bdf$casesmr <- bdf$noofcases/bdf$Ecases
summary(bdf$casesmr)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1743  0.6307  0.8042  0.8691  1.0865  3.1108
a <- range(bdf$casesmr)

p31 <-  ggplot(data=bdf, aes(x=long, y=lat, group = group, fill=casesmr)) +
  scale_fill_gradientn(colours=colpalette, na.value="black",limits=a)  +
  geom_polygon(colour='black',size=0.25) +
  geom_polygon(data=engregmap, aes(x=long, y=lat, group = group), fill=NA, colour='black',size=0.6)  +
  coord_equal() + guides(fill=guide_colorbar(title="Case SMR")) +
  theme_bw()+theme(text=element_text(family="Times")) +
  labs(title= "Standardised case morbidity rate",  x="", y = "", size=2.5) +
  theme(axis.text.x = element_blank(), axis.text.y = element_blank(),axis.ticks = element_blank())   +
  theme(legend.position =c(0.2, 0.5)) + 
  ggsn::scalebar(data=bdf, dist =50, location = "topleft", transform=F, dist_unit = "km",
                 st.dist = .05, st.size =4, height = .06, st.bottom=T)
p31


ggsave(filename=paste0(figurepath, "england_case_smr.png"))


no2eobs <- ggplot(data=bdf, aes(x=long, y=lat, group = group, fill=no2)) +
  scale_fill_gradientn(colours=colpalette, na.value="black",limits=range(bdf$no2))  +
  geom_polygon(colour='black',size=0.25) +  
  geom_polygon(data=engregmap, aes(x=long, y=lat, group = group), fill=NA, colour='black',size=0.6)  + 
  coord_equal() + guides(fill=guide_colorbar(title="Mean NO2")) + 
  theme_bw()+theme(text=element_text(family="Times")) + 
  labs(title= "Average NO2 levels",  x="", y = "", size=2.5) + 
  theme(axis.text.x = element_blank(), axis.text.y = element_blank(),axis.ticks = element_blank())   +
  ggsn::scalebar(data=bdf, dist =50, location = "topleft", transform=F, dist_unit = "km",
                 st.dist = .05, st.size =4, height = .06, st.bottom=T) +
  theme(legend.position =c(0.2, 0.5)) + 
  geom_point(data=aurnsites, aes(x=Easting, y=Northing), inherit.aes = FALSE, size = 1, colour = "black") 
no2eobs


ggsave(filename=paste0(figurepath, "mean_no2_by_LA.png"))

pno2time <- ggplot(data=engdeaths,  aes(x=factor(Weeknumber), y=no2)) + geom_boxplot() + 
  # facet_wrap(~year, ncol=5) + 
  theme(legend.position = "none") +
  labs(#title="Boxplot of square-root precipitation against months by year", 
    x = "Week", y = "NO2 levels")  +
  stat_summary(fun=median, geom="line", aes(group=1, col="0"))
pno2time


ggsave(filename=paste0(figurepath, "no2_boxplots.png"))

p51 <- ggplot(data=wkmeans, aes(x=Week)) +
  geom_bar( aes(y=caserate/2.5), stat="identity", size=.1, fill=casecolor, color="blue", alpha=.4) +
  geom_line(aes(y=no2), size=1, color=deathcolor) + 
  scale_y_continuous(
    # Features of the first axis
    name = "Average NO2 levels",
    # Add a second axis and specify its features
    sec.axis = sec_axis(~.*2.5, name="Caserate per 100,000")
  ) + 
  # theme_ipsum() +
  theme(
    axis.title.y = element_text(color = deathcolor, size=13),
    axis.title.y.right = element_text(color = casecolor, size=13)
  ) +
  ggtitle("Weekly NO2 levels (left) and case rate per 100,000 (right)")
p51


pdf(file=paste0(figurepath, "no2_case_by_week.pdf"))
p51
dev.off()
## png 
##   2

head(engdeaths)
##    Areacode mapid spaceid    Region  popn jsa houseprice popdensity  startdate
## 1 E06000001     0       1 NorthEast 93663 0.6     127000        999 2020-03-13
## 2 E06000001     0       1 NorthEast 93663 0.6     127000        999 2020-03-20
## 3 E06000001     0       1 NorthEast 93663 0.6     127000        999 2020-03-27
## 4 E06000001     0       1 NorthEast 93663 0.6     127000        999 2020-04-03
## 5 E06000001     0       1 NorthEast 93663 0.6     127000        999 2020-04-10
## 6 E06000001     0       1 NorthEast 93663 0.6     127000        999 2020-04-17
##   Weeknumber       no2 covid allcause noofcases lag1noofcases lag2noofcases
## 1         11  8.285714     0       18         2             0             0
## 2         12 12.000000     1       24         4             2             0
## 3         13  6.142857     1       18        19             4             2
## 4         14  7.142857     5       31        50            19             4
## 5         15  8.571429    14       33        45            50            19
## 6         16  2.428571    10       35        99            45            50
##   lag3noofcases lag4noofcases         n0         n1         n2         n3
## 1             0             0 -2.3634075 -7.6009025 -7.6009025 -7.6009025
## 2             0             0 -1.6702603 -2.3634075 -7.6009025 -7.6009025
## 3             0             0 -0.1121157 -1.6702603 -2.3634075 -7.6009025
## 4             2             0  0.8554683 -0.1121157 -1.6702603 -2.3634075
## 5             4             2  0.7501078  0.8554683 -0.1121157 -1.6702603
## 6            19             4  1.5385652  0.7501078  0.8554683 -0.1121157
##          n4  Edeaths  Ecases logEdeaths logEcases highdeathsmr   caserate
## 1 -7.600902 3.817255 21.2542   1.339532  3.056555            0   2.135315
## 2 -7.600902 5.089673 21.2542   1.627214  3.056555            0   4.270630
## 3 -7.600902 3.817255 21.2542   1.339532  3.056555            0  20.285492
## 4 -7.600902 6.574161 21.2542   1.883147  3.056555            0  53.382873
## 5 -2.363408 6.998301 21.2542   1.945667  3.056555            1  48.044585
## 6 -1.670260 7.422440 21.2542   2.004508  3.056555            1 105.698088
##   deathrate
## 1  0.000000
## 2  1.067657
## 3  1.067657
## 4  5.338287
## 5 14.947204
## 6 10.676575
engdeaths$casesmr <- engdeaths$noofcases/engdeaths$Ecases
engdeaths$smr <- engdeaths$covid/engdeaths$Edeaths
dp1 <- engdeaths[, c("jsa", "houseprice", "popdensity", "no2", "casesmr", "smr")]
head(dp1)
##   jsa houseprice popdensity       no2    casesmr       smr
## 1 0.6     127000        999  8.285714 0.09409903 0.0000000
## 2 0.6     127000        999 12.000000 0.18819807 0.1964763
## 3 0.6     127000        999  6.142857 0.89394082 0.2619684
## 4 0.6     127000        999  7.142857 2.35247583 0.7605533
## 5 0.6     127000        999  8.571429 2.11722825 2.0004857
## 6 0.6     127000        999  2.428571 4.65790215 1.3472659
dp1$no2 <- sqrt(dp1$no2)
dp1$houseprice <- log10(dp1$houseprice)
dp1$popdensity <- log(dp1$popdensity)
head(dp1)
##   jsa houseprice popdensity      no2    casesmr       smr
## 1 0.6   5.103804   6.906755 2.878492 0.09409903 0.0000000
## 2 0.6   5.103804   6.906755 3.464102 0.18819807 0.1964763
## 3 0.6   5.103804   6.906755 2.478479 0.89394082 0.2619684
## 4 0.6   5.103804   6.906755 2.672612 2.35247583 0.7605533
## 5 0.6   5.103804   6.906755 2.927700 2.11722825 2.0004857
## 6 0.6   5.103804   6.906755 1.558387 4.65790215 1.3472659

colnames(dp1) <- c("jsa", "price",  "popden",   "no2", "casesmr", "deathsmr")

p3 <- dp1 %>% ggpairs(., # mapping = ggplot2::aes(colour="blue"),
                      lower = list(continuous = wrap("smooth", alpha = 0.3, size=0.1),
                                   discrete = "blank", colpalettebo = wrap("box_no_facet", alpha=0.5)),
                      diag = list(discrete="barDiag",
                                  continuous = wrap("densityDiag", alpha=0.5 )),
                      upper = list(colpalettebo = "blank", discrete = "blank", continuous = wrap("cor", family="sans", size=4, align_percent=0.5))) +
  theme(panel.grid.major = element_blank())
p3


ggsave(filename=paste0(figurepath, "covd_and_socio.png"))

# All done 
end.time <- proc.time()[3]
comp.time<- (end.time-start.time)/60
# comp.time<-fancy.time(comp.time)
print(comp.time)
##  elapsed 
## 1.398633