ENV221 L06

Author

Peng Zhao

1 Overview of the module and R

2 R Basic Operations

3 R Programming

4 Statistical Graphs

5 Basic Concepts

6 Descriptive statistics

6.1 Learning objectives

In this lecture, you will

know how to describe categorical data and numerical data, and
use appropriate tables and graphs for presenting categorical and numerical data.

6.2 Categorical data

Categorical data are often described with frequency/count and proportion.

6.2.1 One-way tables

x <- read.csv('data/students_env221.csv')
tb_gender <- table(x$GENDER)
tb_gender


Female   Male 
    35     22

prop.table(tb_gender)


   Female      Male 
0.6140351 0.3859649

# marginal statistics
Epi::stat.table(list(GENDER), data = x, margins = TRUE)

 ----------------- 
 GENDER   count()  
 ----------------- 
 Female        35  
 Male          22  
                   
 Total         57  
 -----------------

which could be displayed as a long version:

dtf_gender <- data.frame(tb_gender)
names(dtf_gender) <- c('Gender', 'Frequency')

Gender	Frequency
Female	35
Male	22

Visualization: bar plots and pie charts

# Bar plot is a common way
library(ggplot2)
ggplot(x) + 
  geom_bar(aes(y = GENDER, fill = GENDER))

# A pie chart is good if you have only a few groups.
pie(tb_gender)

# but is a disaster when you have many groups
tb_disp <- table(mtcars$disp)
pie(tb_disp, col = rainbow(length(tb_disp)))
# especially a 3-D pie chart.
library(plotrix)
pie3D(tb_disp)
# instead, use a bar plot

6.2.2 Two-way tables

Contingency tables/cross-tabulation

table(x$GENDER, x$HOME)

        
          North China North China Other places South China
  Female            1          11            1          22
  Male              0           8            2          12

unique(x$HOME)

[1] "South China"  "North China"  "Other places" " North China"

x$HOME[x$HOME == ' North China'] <- 'North China'
table(x$GENDER, x$HOME)

        
         North China Other places South China
  Female          12            1          22
  Male             8            2          12

tb2 <- table(x$GENDER, x$HOME)
prop.table(tb2)

        
         North China Other places South China
  Female  0.21052632   0.01754386  0.38596491
  Male    0.14035088   0.03508772  0.21052632

round(prop.table(tb2), 3)

        
         North China Other places South China
  Female       0.211        0.018       0.386
  Male         0.140        0.035       0.211

Epi::stat.table(list(GENDER, HOME), data = x, margins = TRUE)

 ----------------------------------------- 
         --------------HOME--------------- 
 GENDER     North   Other   South   Total  
            China  places   China          
 ----------------------------------------- 
 Female        12       1      22      35  
 Male           8       2      12      22  
                                           
 Total         20       3      34      57  
 -----------------------------------------

Question: How do you display it in a long version?

dtf2 <- data.frame(table(x$GENDER, x$HOME))
names(dtf2) <- c('Gender', 'Home', 'Frequency')

Gender	Home	Frequency
Female	North China	12
Male	North China	8
Female	Other places	1
Male	Other places	2
Female	South China	22
Male	South China	12

Visualization: bar plots

library(ggplot2)
ggplot(x) + geom_bar(aes(y = GENDER, fill = HOME), width = 0.2)

ggplot(x) + geom_bar(aes(y = GENDER, fill = HOME), position = 'fill', width = 0.2)

ggplot(x) + geom_bar(aes(x = GENDER, fill = HOME), position = 'dodge')

6.2.3 Three-way tables

table(x$GENDER, x$HOME, x$YEAR)

, ,  = 1998

        
         North China Other places South China
  Female           0            0           0
  Male             1            1           0

, ,  = 1999

        
         North China Other places South China
  Female           1            0           1
  Male             1            0           0

, ,  = 2000

        
         North China Other places South China
  Female           0            0           1
  Male             4            0           0

, ,  = 2001

        
         North China Other places South China
  Female           4            0           4
  Male             0            0           5

, ,  = 2002

        
         North China Other places South China
  Female           6            1          16
  Male             1            1           6

, ,  = 2022

        
         North China Other places South China
  Female           0            0           0
  Male             0            0           1

HairEyeColor

, , Sex = Male

       Eye
Hair    Brown Blue Hazel Green
  Black    32   11    10     3
  Brown    53   50    25    15
  Red      10   10     7     7
  Blond     3   30     5     8

, , Sex = Female

       Eye
Hair    Brown Blue Hazel Green
  Black    36    9     5     2
  Brown    66   34    29    14
  Red      16    7     7     7
  Blond     4   64     5     8

Question: How do you display it in a report/publication?

Gender		Male				Female
		Eye
		Brown	Blue	Hazel	Green	Brown	Blue	Hazel	Green
Hair	Black	32	11	10	3	36	9	5	2
	Brown	53	50	25	15	66	34	29	14
	Red	10	10	7	7	16	7	7	7
	Blond	3	30	5	8	4	64	5	8

Question: How do you display it in a long version?

More-way table:

Titanic

, , Age = Child, Survived = No

      Sex
Class  Male Female
  1st     0      0
  2nd     0      0
  3rd    35     17
  Crew    0      0

, , Age = Adult, Survived = No

      Sex
Class  Male Female
  1st   118      4
  2nd   154     13
  3rd   387     89
  Crew  670      3

, , Age = Child, Survived = Yes

      Sex
Class  Male Female
  1st     5      1
  2nd    11     13
  3rd    13     14
  Crew    0      0

, , Age = Adult, Survived = Yes

      Sex
Class  Male Female
  1st    57    140
  2nd    14     80
  3rd    75     76
  Crew  192     20

Question: How do you display it in a long version?

Visualization: bar plots and mosaic plot

ggplot(x) + geom_bar(aes(x = GENDER, fill = HOME)) + 
  facet_wrap( ~ YEAR)

ggplot(data.frame(HairEyeColor)) + 
  geom_bar(aes(y = Freq, x = Sex, fill = Sex), stat = 'identity') +
  facet_grid(Hair ~ Eye)

mosaicplot(Titanic, color = TRUE)

6.3 Numerical data

6.3.1 Describe the center

Mean

Arithmetic Mean: \(\mu = \Sigma x_i / N\), \(\bar x = \Sigma x_i / n\).
Geometric Mean: \(G=\sqrt[n] {\Pi x_i}\)
Harmonic Mean: \(H = n/\Sigma \frac{1}{x_i}\)

Example:

The daily temperatures for last week were \(t_1 = 27, t_2 = 26, t_3 = 30, t_4 = 27, t_5 = 29, t_6 = 30, t_7 = 25 ^\circ\)C. What was the mean temperature?

(27+ 26 + 30 + 27 + 29 + 30 + 25) / 7
tc <- c(27, 26, 30, 27, 29, 30, 25)
sum(tc)/length(tc)
mean(tc)

Median:: Middle value of an ordered set. Also called \(Q_2\) and \(P_{50}\).

If n is odd, the median is the \((n+1)/2\) -th largest observation
If n is even, the median is the mean of the \(n/2\) -th and the \((n+1)/2\) -th largest observation

Examples:

What is the median of the sepal length in the iris dataset?

mean(sort(iris$Sepal.Length)[75:76])

median(iris$Sepal.Length)

You wanted to know the surface water temperature of the Taihu Lake. You randomly selected four locations at 11:59 PM on 1/1/2021, and got the following temperatures: 3.0, 2.1, 3.0 and 24 \(^\circ\)C. Find the measures of the center with the mean and median.

tc <- c(3.0, 2.1, 3.0, 24)
mean(tc)
median(tc)

Mean: the best way to estimate a subject of a population
Median: the safest.
Both are insufficient and unsatisfactory because of the lack of the spread.

6.3.2 Describe the spread

Generic Measure: Range: \(x_\text{max} - x_\text{min}\)

Example: What is the range of the sepal length in the iris dataset?

max(iris$Sepal.Length) - min(iris$Sepal.Length)

[1] 3.6

range(iris$Sepal.Length)

[1] 4.3 7.9

Paired with the mean: Standard deviation

Variance: for a population, \(\sigma^2 = \frac{\Sigma (x_i - \mu)^2}{N}\); for a sample, \(s^2 = \frac{\Sigma (x_i - \bar x)^2}{n-1}\)

Standard deviation: \(\sigma\), \(s\)

Coefficient of variation: \(c_v = \sigma / \mu\), \(c_v = s / \bar x\)

Example: what are the standard deviation, the variance, and coefficient of variation for the sepal length in the iris dataset?

x <- iris$Sepal.Length
n <- length(x)
x_bar <- mean(x)
x_variance <- sum((x - x_bar) ^ 2) / (n - 1)
x_sd <- sqrt(x_variance)
x_cv <- x_sd/x_bar
  
# use the built-in functions:
var(x)
sd(x)

Paired with the median: Quartiles/Percentiles

The p-th percentile: the value \(V_p\) such that p% of the sample points are less than or equal to \(V_p\).

Three quartiles:\(Q_1 = P_{25}\), \(Q_2 = P_{50} = \mathrm {median}\), \(Q_3 = P_{75}\).

Interquartile range (IQR): \(\text{IQR} = Q_3 − Q_1 = P_{75} − P_{25}\)

Example: Calculate \(Q_1, Q_2, Q_3\), and IQR of the sepal length in the iris dataset.

x <- iris$Sepal.Length
q1 <- sort(x)[round(150 * 0.25)]
q2 <- median(x)
q3 <- sort(x)[round(150 * 0.75)]
quantile(x, probs = c(0.25, 0.5, 0.75))
IQR(x)
fivenum(x)
summary(x)

6.3.3 Visualizing the center and the spread

Frequency distribution table

Example: Sepal length in iris

Class	Frequency	Midpoint	Relative_frequency	Cumulative_frequency
4.1 - 4.5	5	4.3	0.033	0.033
4.6 - 5	27	4.8	0.180	0.213
5.1 - 5.5	27	5.3	0.180	0.393
5.6 - 6	30	5.8	0.200	0.593
6.1 - 6.5	31	6.3	0.207	0.800
6.6 - 7	18	6.8	0.120	0.920
7.1 - 7.5	6	7.3	0.040	0.960
7.6 - 8	6	7.8	0.040	1.000

Question: How to make this table?

freq <- hist(iris$Sepal.Length, plot = FALSE)
dtf_freq <- data.frame(Class = paste0(freq$breaks[-length(freq$breaks)] + 0.1, ' - ', freq$breaks[-1]),
                       Frequency = freq$counts,
                       Midpoint = freq$mids + 0.05)
dtf_freq$Relative_frequency <- round(dtf_freq$Frequency / length(iris$Sepal.Length), 3)
dtf_freq$Cumulative_frequency <- cumsum(dtf_freq$Relative_frequency)
dtf_freq

Histogram:: display numerical data by indicating the number of data points that lie within a range of values. These ranges of values are called classes or bins.

Example: Histogram for the sepal length of the iris dataset.

hist(iris$Sepal.Length)

ggplot(iris) + geom_histogram(aes(iris$Sepal.Length))

Question: How many bins should we use?

par(mfrow= c(2, 2), las = 1)
for (bins in c(4, 8, 16, 32)){
  hist(iris$Sepal.Length, 
       breaks = bins, 
       main = paste("breaks = ", bins), 
       xlim = c(4, 8), ylim = c(0, 60), 
       xlab = "Sepal Length (cm)")
  box()
}

Sturges’ Rule:

\[n_\text{class} = 1 + \frac{\log_{10}N}{\log_{10}2} = 1 + \log_2 N\]

1 + log10(nrow(iris)) / log10(2)

[1] 8.228819

1 + log2(nrow(iris))

[1] 8.228819

nclass.Sturges(iris$Sepal.Length)

[1] 9

nclass.scott(iris$Sepal.Length)

[1] 7

nclass.FD(iris$Sepal.Length)

[1] 8

Question: The y-scales might be different with different bins. Why?

Density curve:: Convert the frequency to density: Density = Frequency / bin-width / range

Example: Density curve for the sepal length of the iris dataset.

hist(iris$Sepal.Length, freq = FALSE)
lines(density(iris$Sepal.Length), col = "blue")

par(mfrow= c(2, 2), las = 1)
#| code-fold: true
for (bins in c(4, 8, 16, 32)){
  hist(iris$Sepal.Length, 
       freq = FALSE,
       breaks = bins, 
       main = paste("breaks = ", bins), 
       xlim = c(4, 8), 
       ylim = c(0, 0.7), 
       xlab = "Sepal Length (cm)")
  box()
  lines(density(iris$Sepal.Length), col = "blue")
}

Frequency = Density * bin-width * total frequency

Box plot (box-and-whisker plot):: Show groups of numerical data through their quartiles, with lines extending from the boxes (whiskers) indicating variability outside the upper and lower quartiles.

Example: box plot for the sepal length in the iris dataset.

Code

boxplot(iris$Sepal.Length, 
        horizontal = TRUE, 
        xlab = 'Sepal length (cm)')
points(mean(iris$Sepal.Length), 1, col = "red", pch = 16)

ggplot(iris, aes(y = Sepal.Length)) + 
  geom_boxplot() + 
  geom_jitter(aes(x = 0), color = 'lightblue') +
  stat_summary(aes(x = 0), fun = mean, color="red") +
  labs(x = '', y = 'Sepal Length (cm)') +
  theme(axis.ticks.y = element_blank(), 
        axis.text.y = element_blank()) +
  coord_flip()

Question: How do you draw a box plot for the sepal length for each species?

boxplot(iris$Sepal.Length ~ iris$Species, 
        horizontal = TRUE, las = 1,
        xlab = 'Sepal Length (cm)', ylab = '')
sepal_length_mean <- tapply(iris$Sepal.Length, iris$Species, mean)
points(sepal_length_mean, 1:3, col = "red", pch = 16)

tapply(iris$Sepal.Length, iris$Species, summary)

$setosa
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  4.300   4.800   5.000   5.006   5.200   5.800 

$versicolor
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  4.900   5.600   5.900   5.936   6.300   7.000 

$virginica
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  4.900   6.225   6.500   6.588   6.900   7.900

Violin plot:: a combination of a box plot and a density plot to show the distribution shape of the data

ggplot(iris, aes(x = Species, y = Sepal.Length)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  coord_flip()

6.3.4 Describe the shape

Skewness:: Central symmetry - No skew - Left skew - Right skew

Example: Calculate the skewness of the sepal length in the iris dataset.

library(e1071)
hist(iris$Sepal.Length)
skewness(iris$Sepal.Length)
hist(mtcars$mpg)
skewness(mtcars$mpg)

skewness < -1: much left/negative skew.
-1 < skewness < -0.5: moderately left/negative skew.
-0.5 < skewness < 0.5: approximately symmetric.
0.5 < skewness < 1: moderately right/positive skew.
1 < skewness: much right/positive skew.

Kurtosis:: Peakedness - Leptokurtic distributions: tall and slender. - Platykurtic distributions: broad and flat.

kurtosis(iris$Sepal.Length)

kurtosis < 0: platykurtic.
kurtosis = 0: mesokurtic.–> normal distribution (-7 to +7, Bryne (2010)).
kurtosis > 0: leptokurtic.

6.4 Readings

Manga Guide to Statistics - Chapter 2 and 3