Hands-on_Ex04

Author

Huynh Minh Phuong

Visual Statistical analysis

Getting Started

Install and launch R packages

pacman::p_load(ggstatsplot, tidyverse)

Import data

exam <- read_csv("data/Exam_data.csv")

One-sample test: gghistostats() method

A Bayes factor is the ratio of the likelihood of one particular hypothesis to the likelihood of another. It can be interpreted as a measure of the strength of evidence in favor of one theory among two competing theories.

set.seed(1234)

gghistostats(
  data = exam,
  x = ENGLISH,
  type = "bayes",
  test.value = 60,
  xlab = "English scores"
)

Two-sample mean test: ggbetweenstats() method

Two-sample mean test of Maths scores by gender.

ggbetweenstats(
  data = exam,
  x = GENDER, 
  y = MATHS,
  type = "np",
  messages = FALSE
)

“ns” → only non-significant “s” → only significant “all” → everything

Oneway ANOVA Test: ggbetweenstats() method

One-way ANOVA test on English score by race.

ggbetweenstats(
  data = exam,
  x = RACE, 
  y = ENGLISH,
  type = "p",
  mean.ci = TRUE, 
  pairwise.comparisons = TRUE, 
  pairwise.display = "s",
  p.adjust.method = "fdr",
  messages = FALSE
)

Significant Test of Correlation: ggscatterstats()

Significant Test of Correlation between Maths scores and English scores.

ggscatterstats(
  data = exam,
  x = MATHS,
  y = ENGLISH,
  marginal = FALSE,
  )

Significant Test of Association: ggbarstats() methods

Maths scores is binned into a 4-class variable by using cut().

exam1 <- exam %>% 
  mutate(MATHS_bins = 
           cut(MATHS, 
               breaks = c(0,60,75,85,100))
)

Significant Test of Association between Maths score and gender.

ggbarstats(exam1, 
           x = MATHS_bins, 
           y = GENDER)

Visualizing Models

Toyota Corolla case study will be used. The purpose of study is to build a model to discover factors affecting prices of used-cars by taking into consideration a set of explanatory variables.

Install and load neccessary libraries and import dataset

pacman::p_load(readxl, performance, parameters, see)
car_resale <- read_xls("data/ToyotaCorolla.xls", 
                       "data")
car_resale

# A tibble: 1,436 × 38
      Id Model    Price Age_08_04 Mfg_Month Mfg_Year     KM Quarterly_Tax Weight
   <dbl> <chr>    <dbl>     <dbl>     <dbl>    <dbl>  <dbl>         <dbl>  <dbl>
 1    81 TOYOTA … 18950        25         8     2002  20019           100   1180
 2     1 TOYOTA … 13500        23        10     2002  46986           210   1165
 3     2 TOYOTA … 13750        23        10     2002  72937           210   1165
 4     3  TOYOTA… 13950        24         9     2002  41711           210   1165
 5     4 TOYOTA … 14950        26         7     2002  48000           210   1165
 6     5 TOYOTA … 13750        30         3     2002  38500           210   1170
 7     6 TOYOTA … 12950        32         1     2002  61000           210   1170
 8     7  TOYOTA… 16900        27         6     2002  94612           210   1245
 9     8 TOYOTA … 18600        30         3     2002  75889           210   1245
10    44 TOYOTA … 16950        27         6     2002 110404           234   1255
# ℹ 1,426 more rows
# ℹ 29 more variables: Guarantee_Period <dbl>, HP_Bin <chr>, CC_bin <chr>,
#   Doors <dbl>, Gears <dbl>, Cylinders <dbl>, Fuel_Type <chr>, Color <chr>,
#   Met_Color <dbl>, Automatic <dbl>, Mfr_Guarantee <dbl>,
#   BOVAG_Guarantee <dbl>, ABS <dbl>, Airbag_1 <dbl>, Airbag_2 <dbl>,
#   Airco <dbl>, Automatic_airco <dbl>, Boardcomputer <dbl>, CD_Player <dbl>,
#   Central_Lock <dbl>, Powered_Windows <dbl>, Power_Steering <dbl>, …

Multiple Regression Model using lm()

model <- lm(Price ~ Age_08_04 + Mfg_Year + KM + 
              Weight + Guarantee_Period, data = car_resale)
model


Call:
lm(formula = Price ~ Age_08_04 + Mfg_Year + KM + Weight + Guarantee_Period, 
    data = car_resale)

Coefficients:
     (Intercept)         Age_08_04          Mfg_Year                KM  
      -2.637e+06        -1.409e+01         1.315e+03        -2.323e-02  
          Weight  Guarantee_Period  
       1.903e+01         2.770e+01

Model Diagnostic: checking for multicolinearity:

Use check_collinearity() of performance package

check_collinearity(model)

# Check for Multicollinearity

Low Correlation

             Term  VIF     VIF 95% CI Increased SE Tolerance Tolerance 95% CI
               KM 1.46 [ 1.37,  1.57]         1.21      0.68     [0.64, 0.73]
           Weight 1.41 [ 1.32,  1.51]         1.19      0.71     [0.66, 0.76]
 Guarantee_Period 1.04 [ 1.01,  1.17]         1.02      0.97     [0.86, 0.99]

High Correlation

      Term   VIF     VIF 95% CI Increased SE Tolerance Tolerance 95% CI
 Age_08_04 31.07 [28.08, 34.38]         5.57      0.03     [0.03, 0.04]
  Mfg_Year 31.16 [28.16, 34.48]         5.58      0.03     [0.03, 0.04]

check_c <- check_collinearity(model)
plot(check_c)

Model Diagnostic: checking normality assumption

Use check_normality() of performance package

model1 <- lm(Price ~ Age_08_04 + KM + 
              Weight + Guarantee_Period, data = car_resale)

check_n <- check_normality(model1)

plot(check_n)

Model diagnostic: Check model for homogeneity of variances

Use check_heteroscedasticity() of performance package

check_h <- check_heteroscedasticity(model1)

plot(check_h)

Model Diagnostic: Complete check

Use check_model().

check_model(model1)

Visualising Regression Parameters: see methods

plot() of see package and parameters() of parameters package is used to visualise the parameters of a regression model.

plot(parameters(model1))

Visualising Regression ParametersL ggcoefstats() methods

Use ggcoefstats() of ggstatsplot package to visualise the parameters of a regression model

ggcoefstats(model1, 
            output = "plot")

Visualizing Uncertainty

Visualizing Uncertainty of Point Estimates

pacman::p_load(tidyverse, plotly, crosstalk, DT, ggdist, gganimate)
exam <- read_csv("data/Exam_data.csv")

Visualizing uncertainty of point estimates: ggplot2 methods

my_sum <- exam %>%
  group_by(RACE) %>%
  summarise(
    n=n(),
    mean=mean(MATHS),
    sd=sd(MATHS)
    ) %>%
  mutate(se=sd/sqrt(n-1))

knitr::kable(head(my_sum), format = 'html')

RACE	n	mean	sd	se
Chinese	193	76.50777	15.69040	1.132357
Indian	12	60.66667	23.35237	7.041005
Malay	108	57.44444	21.13478	2.043177
Others	9	69.66667	10.72381	3.791438

Plot the standard error of mean maths score by race.

ggplot(my_sum) +
  geom_errorbar(
    aes(x=reorder(RACE,-mean), 
        ymin=mean-se, 
        ymax=mean+se), 
    width=0.2, 
    colour="black", 
    alpha=0.9, 
    size=0.5) +
  geom_point(aes
           (x=RACE, 
            y=mean), 
           stat="identity", 
           color="red",
           size = 1.5,
           alpha=1) +
  labs(x="Race") +
  ggtitle("Standard error of mean 
          maths score by race")

Plot the 95% confidence interval of mean maths score by race.

ggplot(my_sum) +
  geom_errorbar(
    aes(x=reorder(RACE,-mean), 
        ymin=mean-1.96*se, 
        ymax=mean+1.96*se), 
    width=0.2, 
    colour="black", 
    alpha=0.9, 
    size=0.5) +
  geom_point(aes
           (x=RACE, 
            y=mean), 
           stat="identity", 
           color="red",
           size = 1.5,
           alpha=1) +
  labs(x="Race") +
  ggtitle("95% confidence interval of mean 
          maths score by race")+
  theme_minimal()

Plot interactive error bars for the 99% confidence interval of mean maths score by race.

d <- highlight_key(my_sum) 
p <- ggplot(d) +
  geom_errorbar(
    aes(x=reorder(RACE,-mean), 
        ymin=mean-2.58*se, 
        ymax=mean+2.58*se), 
    width=0.2, 
    colour="black", 
    alpha=0.9, 
    size=0.5) +
  geom_point(aes
           (x=RACE, 
            y=mean,
            text=paste("Race:", RACE,
                     "<br>N:", n,
                     "<br>Avg. Scores:", round(mean, digits = 2),
                     "<br>99% CI:[", round(mean - 2.58*se, digits = 2), ", ", round(mean + 2.58*se, digits = 2), "]")), 
           stat="identity", 
           color="red",
           size = 1.5,
           alpha=1) +
  labs(x="Race") +
  ggtitle("99% confidence interval of mean 
          maths score by race")+
  theme_minimal()

gg <- highlight(ggplotly(p, tooltip="text"),        
                "plotly_selected")

dt <- DT::datatable(d,
                    colnames = c("","No. of pupils", "Avg Scores", "Std Dev", "Std Error")) |> 
  formatRound(columns = c("mean", "sd", "se"), digits = 2)

crosstalk::bscols(gg,               
                  dt, 
                  widths = 5)

Visualising Uncertainty: ggdist package

stat_pointinterval() of ggdist is used to build a visual for displaying distribution of maths scores by race.

exam %>%
  ggplot(aes(x = RACE, 
             y = MATHS)) +
  stat_pointinterval() +   #<<
  labs(
    title = "Visualising confidence intervals of mean math score",
    subtitle = "Mean Point + Multiple-interval plot")

Showing 95% and 99% confidence interval with mean

exam %>% 
  ggplot(aes(x = RACE,
             y = MATHS)) +
  
  #refer to point_interval argument in stat_pointinterval() help
  stat_pointinterval(
    .point = mean,
    .interval = c(qi(0.05), qi(0.01))    
  ) +
  
  labs(
    title = "Visualising confidence intervals of mean math score",
    subtitle = "Mean Point + Multiple-interval plot"
  )

stat_gradientinterval() of ggdist is used to build a visual for displaying distribution of maths scores by race.

exam %>%
  ggplot(aes(x = RACE, 
             y = MATHS)) +
  stat_gradientinterval(   
    fill = "skyblue",      
    show.legend = TRUE     
  ) +                        
  labs(
    title = "Visualising confidence intervals of mean math score",
    subtitle = "Gradient + interval plot")

Visualising Uncertainty with Hypothetical Outcome Plots (HOPs)

#devtools::install_github("wilkelab/ungeviz", force = TRUE)
library(ungeviz)

ggplot(data = exam, 
       (aes(x = factor(RACE), y = MATHS))) +
  geom_point(position = position_jitter(
    height = 0.3, width = 0.05), 
    size = 0.4, color = "#0072B2", alpha = 1/2) +
  geom_hpline(data = sampler(25, group = RACE), height = 0.6, color = "#D55E00") +
  theme_bw() + 
  # `.draw` is a generated column indicating the sample draw
  transition_states(.draw, 1, 3)+
  xlab("Race")

Funnel Plots for Fair Comparisons

Getting started

pacman::p_load(tidyverse, FunnelPlotR, plotly, knitr)

covid19 <- read_csv("data/COVID-19_DKI_Jakarta.csv") %>%
  mutate_if(is.character, as.factor)

covid19

# A tibble: 267 × 7
   `Sub-district ID` City       District `Sub-district` Positive Recovered Death
               <dbl> <fct>      <fct>    <fct>             <dbl>     <dbl> <dbl>
 1        3172051003 JAKARTA U… PADEMAN… ANCOL              1776      1691    26
 2        3173041007 JAKARTA B… TAMBORA  ANGKE              1783      1720    29
 3        3175041005 JAKARTA T… KRAMAT … BALE KAMBANG       2049      1964    31
 4        3175031003 JAKARTA T… JATINEG… BALI MESTER         827       797    13
 5        3175101006 JAKARTA T… CIPAYUNG BAMBU APUS         2866      2792    27
 6        3174031002 JAKARTA S… MAMPANG… BANGKA             1828      1757    26
 7        3175051002 JAKARTA T… PASAR R… BARU               2541      2433    37
 8        3175041004 JAKARTA T… KRAMAT … BATU AMPAR         3608      3445    68
 9        3171071002 JAKARTA P… TANAH A… BENDUNGAN HIL…     2012      1937    38
10        3175031002 JAKARTA T… JATINEG… BIDARA CINA        2900      2773    52
# ℹ 257 more rows

FunnelPlotR methods

FunnelPlotR package requires a numerator (events of interest), denominator (population to be considered) and group:

limit: plot limits (95 or 99). label_outliers: label outliers (true or false). Poisson_limits: add Poisson limits to the plot. OD_adjust: add overdispersed limits to the plot. xrange and yrange: specify the range to display for axes

funnel_plot(
  numerator = covid19$Death,
  denominator = covid19$Positive,
  group = covid19$`Sub-district`,
  data_type = "PR",   
  xrange = c(0, 6500),  
  yrange = c(0, 0.05),
  label = NA,
  title = "Cumulative COVID-19 Fatality Rate by Cumulative Total Number of COVID-19 Positive Cases", #<<           
  x_label = "Cumulative COVID-19 Positive Cases", #<<
  y_label = "Cumulative Fatality Rate"  #<<
)

A funnel plot object with 267 points of which 7 are outliers. 
Plot is adjusted for overdispersion.

Funnel Plot for Fair Visual Comparison: ggplot2 methods

Derive cumulative death rate and standard error of cumulative death rate.

df <- covid19 %>%
  mutate(rate = Death / Positive) %>%
  mutate(rate.se = sqrt((rate*(1-rate)) / (Positive))) %>%
  filter(rate > 0)

Compute fit.mean

fit.mean <- weighted.mean(df$rate, 1/df$rate.se^2)

Calculate lower and upper limits for 95% and 99.9% CI

number.seq <- seq(1, max(df$Positive), 1)
number.ll95 <- fit.mean - 1.96 * sqrt((fit.mean*(1-fit.mean)) / (number.seq)) 
number.ul95 <- fit.mean + 1.96 * sqrt((fit.mean*(1-fit.mean)) / (number.seq)) 
number.ll999 <- fit.mean - 3.29 * sqrt((fit.mean*(1-fit.mean)) / (number.seq)) 
number.ul999 <- fit.mean + 3.29 * sqrt((fit.mean*(1-fit.mean)) / (number.seq)) 
dfCI <- data.frame(number.ll95, number.ul95, number.ll999, 
                   number.ul999, number.seq, fit.mean)

Plot a static funnel plot

p <- ggplot(df, aes(x = Positive, y = rate)) +
  geom_point(aes(label=`Sub-district`), 
             alpha=0.4) +
  geom_line(data = dfCI, 
            aes(x = number.seq, 
                y = number.ll95), 
            size = 0.4, 
            colour = "grey40", 
            linetype = "dashed") +
  geom_line(data = dfCI, 
            aes(x = number.seq, 
                y = number.ul95), 
            size = 0.4, 
            colour = "grey40", 
            linetype = "dashed") +
  geom_line(data = dfCI, 
            aes(x = number.seq, 
                y = number.ll999), 
            size = 0.4, 
            colour = "grey40") +
  geom_line(data = dfCI, 
            aes(x = number.seq, 
                y = number.ul999), 
            size = 0.4, 
            colour = "grey40") +
  geom_hline(data = dfCI, 
             aes(yintercept = fit.mean), 
             size = 0.4, 
             colour = "grey40") +
  coord_cartesian(ylim=c(0,0.05)) +
  annotate("text", x = 1, y = -0.13, label = "95%", size = 3, colour = "grey40") + 
  annotate("text", x = 4.5, y = -0.18, label = "99%", size = 3, colour = "grey40") + 
  ggtitle("Cumulative Fatality Rate by Cumulative Number of COVID-19 Cases") +
  xlab("Cumulative Number of COVID-19 Cases") + 
  ylab("Cumulative Fatality Rate") +
  theme_light() +
  theme(plot.title = element_text(size=12),
        legend.position = c(0.91,0.85), 
        legend.title = element_text(size=7),
        legend.text = element_text(size=7),
        legend.background = element_rect(colour = "grey60", linetype = "dotted"),
        legend.key.height = unit(0.3, "cm"))
p

Plot Interactive funnel plot

Use ggplotly() of plotly r package.

fp_ggplotly <- ggplotly(p,
  tooltip = c("label", 
              "x", 
              "y"))
fp_ggplotly