1. Adding the Chi-squared test

We will load BNC again today:

bnc <- read_csv("../data/bnc_written_sample.csv.bz2")
bnc
## # A tibble: 6,286,676 × 6
##    doc     sid text       hw         pos   c5   
##    <chr> <dbl> <chr>      <chr>      <chr> <chr>
##  1 BNN       1 Essex      essex      SUBST NP0  
##  2 BNN       1 by         by         PREP  PRP  
##  3 BNN       1 the        the        ART   AT0  
##  4 BNN       1 Sea        sea        SUBST NN1  
##  5 BNN       2 Dovercourt dovercourt SUBST NP0  
##  6 BNN       2 in         in         PREP  PRP  
##  7 BNN       2 the        the        ART   AT0  
##  8 BNN       2 thirties   thirties   ADJ   CRD  
##  9 BNN       2 was        be         VERB  VBD  
## 10 BNN       2 an         an         ART   AT0  
## # ℹ 6,286,666 more rows

Let’s start with the same table we had last time looking at the relationship between “good” and “boy”/“girl”. Here is the table of counts:

x11 <- sum(bnc$hw == "good" & lead(bnc$hw) == "boy", na.rm=TRUE)
x12 <- sum(bnc$hw == "good" & lead(bnc$hw) == "girl", na.rm=TRUE)
x21 <- sum(bnc$hw != "good" & lead(bnc$hw) == "boy", na.rm=TRUE)
x22 <- sum(bnc$hw != "good" & lead(bnc$hw) == "girl", na.rm=TRUE)

xtab <- matrix(c(x11, x12, x21, x22), ncol = 2, byrow = TRUE)
colnames(xtab) <- c("boy", "girl")
rownames(xtab) <- c("good", "!good")
xtab
##        boy girl
## good     4   18
## !good 1165 1201

And here are the table of expected counts under the null-hypothesis of independent:

rsum <- apply(xtab, 1, sum)
csum <- apply(xtab, 2, sum)
ecount <- rsum[row(xtab)] * csum[col(xtab)] / sum(xtab)
ecount <- matrix(ecount, ncol = 2)
dimnames(ecount) <- dimnames(xtab)
ecount
##              boy       girl
## good    10.76968   11.23032
## !good 1158.23032 1207.76968

As before, here is the G-score:

G <- -2 * sum( xtab * log(ecount / xtab))
G
## [1] 9.13715

The chi-squared score can also be computed using the same values:

CS <- sum( (xtab - ecount)^2 / ecount)
CS
## [1] 8.413638

Notice that they are actually quite similar! They should both have the same chi-squared distribution under H0 with one degree of freedom.

2. Gendered Verbs

Now, how similar are these scores in general? Let’s check with the gender verbs test. First, we grab the terms:

wset <- c("she", "he")

temp <- bnc |>
  filter(!is.na(hw)) |>
  mutate(hw_prev = lag(hw)) |>
  filter(pos == "VERB") |>
  filter(hw_prev %in% wset) |>
  group_by(hw, hw_prev) |>
  summarize(n = n()) |>
  ungroup() |>
  pivot_wider(names_from = "hw_prev", values_from = "n", values_fill = 0)

temp$total <- temp[[2]] + temp[[3]]
temp <- temp[(temp[[2]] > 0) & (temp[[3]] > 0),]
temp <- arrange(temp, desc(total))
temp
## # A tibble: 802 × 4
##    hw       he   she total
##    <chr> <int> <int> <int>
##  1 be     6645  3086  9731
##  2 have   4686  2501  7187
##  3 say    2587  1072  3659
##  4 would  1229   689  1918
##  5 do     1108   567  1675
##  6 could   964   605  1569
##  7 will    663   230   893
##  8 know    455   338   793
##  9 think   429   239   668
## 10 look    380   238   618
## # ℹ 792 more rows

And now we compute the G-scores and chi-squared scores:

df <- tibble(
  word = temp$hw[seq(1, min(200, nrow(temp)))],
  category = NA,
  gscore = NA,
  pval_g = NA,
  chisq = NA,
  pval_c = NA
)

for (j in seq_along(df$word))
{
  w <- temp$hw[j]

  # create the observed counts
  x11 <- sum(bnc$hw == w & lag(bnc$hw) == wset[1], na.rm=TRUE)
  x12 <- sum(bnc$hw == w & lag(bnc$hw) == wset[2], na.rm=TRUE)
  x21 <- sum(bnc$hw != w & lag(bnc$hw) == wset[1], na.rm=TRUE)
  x22 <- sum(bnc$hw != w & lag(bnc$hw) == wset[2], na.rm=TRUE)
  xtab <- matrix(c(x11, x12, x21, x22), ncol = 2, byrow = TRUE)

  # create expected counts
  rsum <- apply(xtab, 1, sum)
  csum <- apply(xtab, 2, sum)
  ecount <- rsum[row(xtab)] * (csum[col(xtab)] / sum(xtab))
  ecount <- matrix(ecount, ncol = 2)

  # find dom. category, g-score, p-value
  df$category[j] <- if_else(xtab[1] > ecount[1], wset[1], wset[2])
  df$gscore[j] <- -2 * sum( xtab * log(ecount / xtab))
  df$pval_g[j] <- 1 - pchisq(df$gscore[j], df = 1)
  df$chisq[j] <- sum( (xtab - ecount)^2 / ecount )
  df$pval_c[j] <- 1 - pchisq(df$chisq[j], df = 1)
}
df <- filter(df, !is.na(pval_g))

Here are the terms associated with “she”. Notice that the chi-squared values are also mostly ordered as well.

df |>
  filter(category == wset[1]) |>
  arrange(desc(gscore))
## # A tibble: 104 × 6
##    word    category gscore   pval_g chisq   pval_c
##    <chr>   <chr>     <dbl>    <dbl> <dbl>    <dbl>
##  1 feel    she        95.9 0        104.  0       
##  2 know    she        39.9 2.69e-10  41.8 9.91e-11
##  3 face    she        35.3 2.80e- 9  37.1 1.10e- 9
##  4 could   she        31.6 1.92e- 8  32.6 1.13e- 8
##  5 suppose she        31.2 2.38e- 8  34.1 5.31e- 9
##  6 whisper she        29.4 5.85e- 8  32.2 1.37e- 8
##  7 have    she        26.1 3.20e- 7  26.5 2.67e- 7
##  8 cry     she        26.1 3.26e- 7  28.5 9.26e- 8
##  9 hear    she        26.0 3.43e- 7  28.0 1.20e- 7
## 10 turn    she        20.3 6.67e- 6  21.3 3.89e- 6
## # ℹ 94 more rows

And, similarly, for the “he” case:

df |>
  filter(category == wset[2]) |>
  arrange(desc(gscore))
## # A tibble: 96 × 6
##    word  category gscore   pval_g chisq        pval_c
##    <chr> <chr>     <dbl>    <dbl> <dbl>         <dbl>
##  1 may   he         43.1 5.28e-11  37.2 0.00000000106
##  2 argue he         37.0 1.15e- 9  29.1 0.0000000689 
##  3 claim he         32.6 1.15e- 8  25.6 0.000000427  
##  4 write he         26.4 2.85e- 7  22.3 0.00000229   
##  5 can   he         25.9 3.66e- 7  24.1 0.000000896  
##  6 play  he         25.5 4.48e- 7  20.9 0.00000473   
##  7 win   he         19.5 1.01e- 5  16.2 0.0000566    
##  8 join  he         15.5 8.30e- 5  12.3 0.000445     
##  9 will  he         15.0 1.07e- 4  14.5 0.000142     
## 10 warn  he         14.5 1.44e- 4  10.4 0.00127      
## # ℹ 86 more rows

And, in fact, looking at a plot of the two scores, we see that (despite having somewhat different forms), the two scores are highly correlated:

df |>
  ggplot(aes(gscore, chisq)) +
    geom_point() +
    scale_x_log10() +
    scale_y_log10()

So, in fact, there really isn’t a huge different based on which test we use. I still prefer the G-test because it makes more intuitive sense, particularly when the null hypothesis is not true and we are using it a measurement of signal strength.

3. Multiple Hypothesis Testing

There is one issue with all of our analyses so far. I hope we can discuss this more later in the semester, but may run out of time, so let’s quickly see the issue here. The chi-squared scores and G-scores are all fine to use as measurements of effect sizes across many different combinations of words. However, the definitions of the p-values (that they have a certain distribution under H0) is only valid when we are looking at a single p-value. If we do a lot of tests and cherry-pick the best p-values, the results are no longer valid. Think about it this way: we know that under H0 we will get a p-value less than 0.05 about 1 in 20 times that we run an experiment. So, if we test 20 things, even if none are in fact significant, we would expected about one of these on average to have a “significant” result. Thankfully, there is a way to correct for this by adjusting the p-values by a factor. The simple, but overly conservative approach, is just to multiple the p-values by the number of tests. A better technique is called the Holm-Bonferroni method. We can apply this in R using the function p.adjust, as follows:

df |>
  mutate(
    pval_g = p.adjust(pval_g),
    pval_c = p.adjust(pval_c)
  ) |>
  arrange(pval_g)
## # A tibble: 200 × 6
##    word    category gscore       pval_g chisq       pval_c
##    <chr>   <chr>     <dbl>        <dbl> <dbl>        <dbl>
##  1 feel    she        95.9 0            104.  0           
##  2 may     he         43.1 0.0000000105  37.2 0.000000211 
##  3 know    she        39.9 0.0000000533  41.8 0.0000000197
##  4 argue   he         37.0 0.000000227   29.1 0.0000133   
##  5 face    she        35.3 0.000000549   37.1 0.000000217 
##  6 claim   he         32.6 0.00000224    25.6 0.0000807   
##  7 could   she        31.6 0.00000372    32.6 0.00000220  
##  8 suppose she        31.2 0.00000459    34.1 0.00000104  
##  9 whisper she        29.4 0.0000112     32.2 0.00000265  
## 10 write   he         26.4 0.0000544     22.3 0.000429    
## # ℹ 190 more rows

Now, all the p-values are valid, even if we cherry-pick the strongest relationships. Are some of the tests still significant even after the adjustment?

4. Zipf’s Law

There is a well-known result in linguistics research called Zipf’s Law, or in the more general case, the Zipf–Mandelbrot law. This roughly says that the frequency of the most common words in a corpus should be proportional to the rank of the words in the corpus. So, in other words, the most common word is about twice as frequent as the second most common word, three times as common as the third most common word, four times as common as the fourth most common word, and so forth. Let’s see if we can test if the BNC follows this law. We start with grabing the 100 most common terms:

temp <- bnc |>
  mutate(text = tolower(text)) |>
  filter(!is.na(hw)) |>
  group_by(text) |>
  summarize(x = n()) |>
  arrange(desc(x)) |>
  slice_head(n = 100)

temp 
## # A tibble: 100 × 2
##    text       x
##    <chr>  <int>
##  1 the   365006
##  2 of    171338
##  3 to    149698
##  4 and   145583
##  5 a     124507
##  6 in    107554
##  7 that   57450
##  8 is     54527
##  9 was    53312
## 10 it     52509
## # ℹ 90 more rows

To determine the expected probabilities, we need to determine the two unknown parameters in the Zipf–Mandelbrot law. I used another R package that I did not have you install to determine that the values of these parameters are about 0.7 and 1.0. So, here are the p-tilde values that define the best model under the null-hypothesis that the data follow this law:

ptilde <- 1 / (seq_len(100) + 0.7)^1
ptilde <- ptilde / sum(ptilde)
ptilde
##   [1] 0.133430410 0.084011740 0.061305864 0.048262063 0.039795035
##   [6] 0.033855477 0.029458662 0.026072609 0.023384711 0.021199224
##  [11] 0.019387325 0.017860764 0.016557058 0.015430728 0.014447879
##  [16] 0.013582736 0.012815350 0.012130037 0.011514299 0.010958053
##  [21] 0.010453074 0.009992586 0.009570958 0.009183470 0.008826136
##  [26] 0.008495569 0.008188870 0.007903543 0.007637431 0.007388655
##  [31] 0.007155574 0.006936749 0.006730911 0.006536937 0.006353829
##  [36] 0.006180700 0.006016756 0.005861284 0.005713645 0.005573260
##  [41] 0.005439609 0.005312218 0.005190657 0.005074535 0.004963494
##  [46] 0.004857210 0.004755381 0.004657735 0.004564018 0.004473998
##  [51] 0.004387460 0.004304207 0.004224054 0.004146832 0.004072382
##  [56] 0.004000559 0.003931225 0.003864254 0.003799526 0.003736931
##  [61] 0.003676365 0.003617730 0.003560937 0.003505899 0.003452537
##  [66] 0.003400775 0.003350542 0.003301771 0.003254400 0.003208369
##  [71] 0.003163622 0.003120106 0.003077771 0.003036569 0.002996456
##  [76] 0.002957388 0.002919327 0.002882232 0.002846069 0.002810802
##  [81] 0.002776398 0.002742826 0.002710056 0.002678060 0.002646811
##  [86] 0.002616283 0.002586450 0.002557291 0.002528781 0.002500901
##  [91] 0.002473628 0.002446944 0.002420829 0.002395266 0.002370237
##  [96] 0.002345726 0.002321716 0.002298193 0.002275142 0.002252549

Now, we get the expected counts:

ecount <- ptilde * sum(temp$x)
ecount
##   [1] 344475.288 216891.848 158272.430 124597.445 102738.244  87404.178
##   [7]  76052.986  67311.263  60371.958  54729.719  50051.965  46110.865
##  [13]  42745.109  39837.278  37299.872  35066.347  33085.197  31315.935
##  [19]  29726.294  28290.241  26986.543  25797.709  24709.198  23708.825
##  [25]  22786.303  21932.883  21141.083  20404.460  19717.441  19075.179
##  [31]  18473.438  17908.501  17377.092  16876.311  16403.585  15956.621
##  [37]  15533.368  15131.989  14750.831  14388.403  14043.357  13714.473
##  [43]  13400.640  13100.850  12814.179  12539.786  12276.897  12024.805
##  [49]  11782.857  11550.453  11327.040  11112.106  10905.177  10705.813
##  [55]  10513.608  10328.183  10149.185   9976.286   9809.179   9647.578
##  [61]   9491.215   9339.840   9193.218   9051.128   8913.364   8779.730
##  [67]   8650.044   8524.134   8401.836   8282.998   8167.475   8055.131
##  [73]   7945.834   7839.464   7735.905   7635.045   7536.782   7441.016
##  [79]   7347.654   7256.605   7167.784   7081.112   6996.511   6913.908
##  [85]   6833.232   6754.417   6677.400   6602.119   6528.517   6456.538
##  [91]   6386.129   6317.238   6249.818   6183.822   6119.206   6055.925
##  [97]   5993.941   5933.212   5873.701   5815.372

And compute the G-score:

G <- -2 * sum( temp$x * log(ecount / temp$x))
G
## [1] 42057.04

And the chi-squared score:

C <- sum((temp$x - ecount)^2 / ecount)
C
## [1] 41610.26

These distributions should be a chi-squared with 100 - 3 degrees of freedom (one lost by the final probability and the others lost by the two parameters estimated in the Zipf–Mandelbrot law).

1 - pchisq(G, df = 100 - 2)
## [1] 0

So, the p-value is very small and the G-score/chi-squared value are both very large. Does this mean the Zipf–Mandelbrot law is false? Well, sort-of. Let’s plot the expected counts against the observed counts:

temp$ecount <- ecount
temp |>
  ggplot(aes(x, ecount)) +
    geom_point(color = "grey50") +
    geom_abline(color = "olivedrab", linewidth = 1) +
    geom_text_repel(aes(label = text), data = temp[c(1:25, 100),]) +
    labs(x = "Observed Count", y = "Expected Count (Zipfs Law)") +
    scale_x_log10() +
    scale_y_log10()

You can see that the “law” is not perfect, but is far from completely invalid. In fact, as a two-parameter model for describing 100 word counts, it actually works really well!

Generally with real data, if we have enough observations we will reject almost any null-hypothesis (remember the common phrase that “all models are wrong”?). We should really look closer though to figure out if the effect sizes are meaningful. We will talk more about these issues in the third and final part of the course as we think about the philosophical and methodological implications of the techniques we have been developing.