1. British National Corpus (BNC)

The British National Corpus, or BNC, is a well-known collection of texts used in linguistics research. The full corpus contains about 100 million words drawn from a wide variety of sources. To balance our ability to get interesting results against the need to run on most machines, I have taken a smaller sample of the dataset and provided it here for us to use. We can load the data as follows:

library(tidyverse)   # provides read_csv(), lead(), pivot_wider(), etc.

bnc <- read_csv("../data/bnc_written_sample.csv.bz2")
bnc
## # A tibble: 6,286,676 × 6
##    doc     sid text       hw         pos   c5   
##    <chr> <dbl> <chr>      <chr>      <chr> <chr>
##  1 BNN       1 Essex      essex      SUBST NP0  
##  2 BNN       1 by         by         PREP  PRP  
##  3 BNN       1 the        the        ART   AT0  
##  4 BNN       1 Sea        sea        SUBST NN1  
##  5 BNN       2 Dovercourt dovercourt SUBST NP0  
##  6 BNN       2 in         in         PREP  PRP  
##  7 BNN       2 the        the        ART   AT0  
##  8 BNN       2 thirties   thirties   ADJ   CRD  
##  9 BNN       2 was        be         VERB  VBD  
## 10 BNN       2 an         an         ART   AT0  
## # ℹ 6,286,666 more rows

You can see that the dataset consists of one row per word or punctuation mark, with the text running one token after another down the rows. The remaining columns provide metadata: the document (doc) and sentence (sid) identifiers, the lemmatized headword of each token (hw), and part-of-speech information at two levels of granularity (pos and c5).
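As a quick check that the format makes sense, here is one way to stitch a sentence back together from the token column (based on the rows printed above, this should return “Essex by the Sea”):

bnc |>
  filter(doc == "BNN", sid == 1) |>
  pull(text) |>
  paste(collapse = " ")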

2. Testing One Table

Let’s start by creating a contingency table that looks at each occurrence of the word “boy” or “girl” in the dataset, and tabulates whether the word right before is “good” or something else. The following code produces such a table.

x11 <- sum(bnc$hw == "good" & lead(bnc$hw) == "boy", na.rm=TRUE)
x12 <- sum(bnc$hw == "good" & lead(bnc$hw) == "girl", na.rm=TRUE)
x21 <- sum(bnc$hw != "good" & lead(bnc$hw) == "boy", na.rm=TRUE)
x22 <- sum(bnc$hw != "good" & lead(bnc$hw) == "girl", na.rm=TRUE)

xtab <- matrix(c(x11, x12, x21, x22), ncol = 2, byrow = TRUE)
colnames(xtab) <- c("boy", "girl")
rownames(xtab) <- c("good", "!good")
xtab
##        boy girl
## good     4   18
## !good 1165 1201

Make sure that the output, if not necessarily every line of the code, makes sense to you. For example, there are 4 occurrences of the phrase “good boy” but 18 of the phrase “good girl”.
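If you would like to verify these counts independently, the same numbers can be recovered with a short dplyr pipeline (it should report 4 for “boy” and 18 for “girl”):

bnc |>
  mutate(hw_next = lead(hw)) |>
  filter(hw == "good", hw_next %in% c("boy", "girl")) |>
  count(hw_next)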

We can compute the expected counts under the null hypothesis that the usage of “good” is independent of whether the next word is “boy” or “girl”. Under independence, the expected count for each cell is its row total times its column total, divided by the grand total:

rsum <- apply(xtab, 1, sum)
csum <- apply(xtab, 2, sum)
ecount <- rsum[row(xtab)] * csum[col(xtab)] / sum(xtab)
ecount <- matrix(ecount, ncol = 2)
dimnames(ecount) <- dimnames(xtab)
ecount
##              boy       girl
## good    10.76968   11.23032
## !good 1158.23032 1207.76968
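As a sanity check, base R’s chisq.test() computes the same expected counts (the statistic it reports is Pearson’s chi-squared rather than the G-score, but the expected counts agree):

chisq.test(xtab)$expected   # should match ecount above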

Now, computing the G-score is straightforward:

G <- -2 * sum( xtab * log(ecount / xtab))
G
## [1] 9.13715
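The minus sign together with the inverted ratio can look odd at first; the expression is algebraically identical to the more familiar form 2 * sum(observed * log(observed / expected)):

2 * sum(xtab * log(xtab / ecount))   # same value as above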

This is a 2×2 table and therefore has (2 - 1) * (2 - 1) = 1 degree of freedom under the null hypothesis. Here is the p-value:

1 - pchisq(G, df = 1)
## [1] 0.00250471

This indicates that there is strong evidence that the rows and columns are not independent; comparing the observed and expected counts, “good girl” occurs more often, and “good boy” less often, than independence would predict.
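One numerical aside: for very large G-scores, 1 - pchisq(...) can underflow to exactly 0 (you will see some zero p-values in the outputs below). Asking pchisq() for the upper tail directly avoids the subtraction:

pchisq(G, df = 1, lower.tail = FALSE)   # same quantity, more precise for tiny p-values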

3. Testing Many Tables

One of the most interesting things that we can do with linguistic contingency tables is to compute the G-scores for many combinations and then look at which scores are the largest. Let’s continue the previous example, but this time find all of the adjectives that most commonly appear right before “boy” or “girl”.

temp <- bnc |>
  mutate(hw_next = lead(hw)) |>
  filter(hw_next %in% c("boy", "girl")) |>
  filter(pos == "ADJ") |>       # only look at adjectives
  group_by(hw, hw_next) |>
  summarize(n = n()) |>
  ungroup() |>
  pivot_wider(names_from = "hw_next", values_from = "n", values_fill = 0) |>
  mutate(total = girl + boy) |>
  arrange(desc(total))
temp
## # A tibble: 277 × 4
##    hw      girl   boy total
##    <chr>  <int> <int> <int>
##  1 little    63    63   126
##  2 young     53    16    69
##  3 old       15    23    38
##  4 this      18    13    31
##  5 other     16    10    26
##  6 small      8    17    25
##  7 that       6    17    23
##  8 two       14     9    23
##  9 good      18     4    22
## 10 new        5    12    17
## # ℹ 267 more rows

Now, we will just do the same thing we did above with “good”, using each of the 50 most frequent words in the table. This might take a minute or two to run.

df <- tibble(
  word = temp$hw[1:50],
  category = NA,
  gscore = NA,
  pvalue = NA
)

for (j in seq_along(df$word))
{
  w <- temp$hw[j]
  
  # create the observed counts
  x11 <- sum(bnc$hw == w & lead(bnc$hw) == "boy", na.rm=TRUE)
  x12 <- sum(bnc$hw == w & lead(bnc$hw) == "girl", na.rm=TRUE)
  x21 <- sum(bnc$hw != w & lead(bnc$hw) == "boy", na.rm=TRUE)
  x22 <- sum(bnc$hw != w & lead(bnc$hw) == "girl", na.rm=TRUE)
  xtab <- matrix(c(x11, x12, x21, x22), ncol = 2, byrow = TRUE)

  # create expected counts
  rsum <- apply(xtab, 1, sum)
  csum <- apply(xtab, 2, sum)
  ecount <- rsum[row(xtab)] * csum[col(xtab)] / sum(xtab)
  ecount <- matrix(ecount, ncol = 2)

  # find dom. category, g-score, p-value
  df$category[j] <- if_else(xtab[1] > ecount[1], "boy", "girl")
  df$gscore[j] <- -2 * sum( xtab * log(ecount / xtab))
  df$pvalue[j] <- 1 - pchisq(df$gscore[j], df = 1)
}
# drop words where a zero cell makes the G-score undefined (NaN)
df <- filter(df, !is.na(pvalue))
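A side note on speed: the loop above recomputes lead(bnc$hw) four times per word, over all six million rows. If you plan to run this on many more words, hoisting the shifted vector out of the loop is an easy speed-up; a sketch of the change:

hw_next <- lead(bnc$hw)   # compute the shifted column once, before the loop
# then inside the loop each condition uses it directly, e.g.:
#   x11 <- sum(bnc$hw == w & hw_next == "boy", na.rm = TRUE)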

Okay, now let’s look at the terms with the highest G-scores that are associated with being more likely to be paired with “girl” than “boy”:

df |>
  filter(category == "girl") |>
  arrange(desc(gscore))
## # A tibble: 20 × 4
##    word         category  gscore     pvalue
##    <chr>        <chr>      <dbl>      <dbl>
##  1 young        girl     19.9    0.00000796
##  2 good         girl      9.14   0.00250   
##  3 beautiful    girl      5.52   0.0188    
##  4 intermediate girl      5.52   0.0188    
##  5 three        girl      5.52   0.0188    
##  6 poor         girl      2.08   0.150     
##  7 pretty       girl      1.94   0.164     
##  8 christian    girl      1.81   0.179     
##  9 clever       girl      1.81   0.179     
## 10 other        girl      1.17   0.279     
## 11 darling      girl      0.966  0.326     
## 12 each         girl      0.966  0.326     
## 13 few          girl      0.966  0.326     
## 14 italian      girl      0.966  0.326     
## 15 two          girl      0.905  0.341     
## 16 this         girl      0.622  0.430     
## 17 black        girl      0.162  0.687     
## 18 naked        girl      0.162  0.687     
## 19 another      girl      0.105  0.746     
## 20 some         girl      0.0542 0.816

Likewise, here are the terms most strongly associated with “boy”:

df |>
  filter(category == "boy") |>
  arrange(desc(gscore))
## # A tibble: 23 × 4
##    word           category gscore pvalue
##    <chr>          <chr>     <dbl>  <dbl>
##  1 bad            boy        5.14 0.0234
##  2 dear           boy        4.30 0.0382
##  3 small          boy        3.74 0.0531
##  4 new            boy        3.29 0.0696
##  5 afro-caribbean boy        3.09 0.0789
##  6 old            boy        2.08 0.149 
##  7 bright         boy        2.06 0.151 
##  8 senior         boy        1.54 0.215 
##  9 that           boy        1.49 0.222 
## 10 those          boy        1.46 0.227 
## # ℹ 13 more rows

Do you see any interesting examples or patterns here? Really take a moment to think about this!

4. Near Synonyms

We are going to more-or-less repeat the idea from the previous section a few times. First, let’s look at two words that have similar meanings: “big” and “large”. What words commonly come right after them (usually these will be the nouns they modify)?

wset <- c("big", "large")

temp <- bnc |>
  filter(!is.na(hw)) |>
  mutate(hw_prev = lag(hw)) |>
  filter(pos == "SUBST") |>     # only look at nouns
  filter(hw_prev %in% wset) |>
  group_by(hw, hw_prev) |>
  summarize(n = n()) |>
  ungroup() |>
  pivot_wider(names_from = "hw_prev", values_from = "n", values_fill = 0)

temp$total <- temp[[2]] + temp[[3]]                 # columns 2 and 3 hold the two counts
temp <- temp[(temp[[2]] > 0) & (temp[[3]] > 0),]    # keep words seen with both terms
temp <- arrange(temp, desc(total))
temp
## # A tibble: 150 × 4
##    hw      large   big total
##    <chr>   <int> <int> <int>
##  1 number    200     1   201
##  2 part       51     8    59
##  3 company    43    10    53
##  4 scale      33     1    34
##  5 problem     7    21    28
##  6 city       13    13    26
##  7 firm       19     7    26
##  8 fish        2    23    25
##  9 man         5    20    25
## 10 sum        24     1    25
## # ℹ 140 more rows

We will build the same type of contingency table for each of the 50 words most frequently associated with these:

df <- tibble(
  word = temp$hw[seq(1, min(50, nrow(temp)))],
  category = NA,
  gscore = NA,
  pvalue = NA
)

for (j in seq_along(df$word))
{
  w <- temp$hw[j]

  # create the observed counts
  x11 <- sum(bnc$hw == w & lag(bnc$hw) == wset[1], na.rm=TRUE)
  x12 <- sum(bnc$hw == w & lag(bnc$hw) == wset[2], na.rm=TRUE)
  x21 <- sum(bnc$hw != w & lag(bnc$hw) == wset[1], na.rm=TRUE)
  x22 <- sum(bnc$hw != w & lag(bnc$hw) == wset[2], na.rm=TRUE)
  xtab <- matrix(c(x11, x12, x21, x22), ncol = 2, byrow = TRUE)

  # create expected counts
  rsum <- apply(xtab, 1, sum)
  csum <- apply(xtab, 2, sum)
  ecount <- rsum[row(xtab)] * csum[col(xtab)] / sum(xtab)
  ecount <- matrix(ecount, ncol = 2)

  # find dom. category, g-score, p-value
  df$category[j] <- if_else(xtab[1] > ecount[1], wset[1], wset[2])
  df$gscore[j] <- -2 * sum( xtab * log(ecount / xtab))
  df$pvalue[j] <- 1 - pchisq(df$gscore[j], df = 1)
}
df <- filter(df, !is.na(pvalue))

Here are the words most highly associated with “big” vs. “large”:

df |>
  filter(category == wset[1]) |>
  arrange(desc(gscore))
## # A tibble: 28 × 4
##    word     category gscore       pvalue
##    <chr>    <chr>     <dbl>        <dbl>
##  1 fish     big       31.0  0.0000000254
##  2 business big       21.5  0.00000353  
##  3 man      big       17.3  0.0000318   
##  4 problem  big       14.7  0.000126    
##  5 one      big       14.5  0.000140    
##  6 star     big       12.9  0.000322    
##  7 stage    big        9.63 0.00192     
##  8 boot     big        8.01 0.00464     
##  9 question big        6.90 0.00863     
## 10 prize    big        6.43 0.0112      
## # ℹ 18 more rows

And those most associated with “large”:

df |>
  filter(category == wset[2]) |>
  arrange(desc(gscore))
## # A tibble: 22 × 4
##    word        category gscore      pvalue
##    <chr>       <chr>     <dbl>       <dbl>
##  1 number      large    196.   0          
##  2 scale       large     26.1  0.000000324
##  3 part        large     19.5  0.0000103  
##  4 sum         large     17.6  0.0000270  
##  5 group       large     15.8  0.0000715  
##  6 size        large     10.3  0.00130    
##  7 company     large      9.94 0.00162    
##  8 measure     large      7.72 0.00547    
##  9 corporation large      7.24 0.00712    
## 10 family      large      5.75 0.0165     
## # ℹ 12 more rows

Again, really take a few minutes to study the output. Think to yourself: would you use the “other” word to modify the words with the highest G-scores? And if so, in what situations? Can you come up with a general rule that describes the top terms?

5. Gendered Verbs

Let’s move to a similar example of something I have studied quite a bit in different contexts: What verbs are associated with the pronouns “she” and “he”? Here are the terms to study:

wset <- c("she", "he")

temp <- bnc |>
  filter(!is.na(hw)) |>
  mutate(hw_prev = lag(hw)) |>
  filter(pos == "VERB") |>
  filter(hw_prev %in% wset) |>
  group_by(hw, hw_prev) |>
  summarize(n = n()) |>
  ungroup() |>
  pivot_wider(names_from = "hw_prev", values_from = "n", values_fill = 0)

temp$total <- temp[[2]] + temp[[3]]
temp <- temp[(temp[[2]] > 0) & (temp[[3]] > 0),]
temp <- arrange(temp, desc(total))
temp
## # A tibble: 802 × 4
##    hw       he   she total
##    <chr> <int> <int> <int>
##  1 be     6645  3086  9731
##  2 have   4686  2501  7187
##  3 say    2587  1072  3659
##  4 would  1229   689  1918
##  5 do     1108   567  1675
##  6 could   964   605  1569
##  7 will    663   230   893
##  8 know    455   338   793
##  9 think   429   239   668
## 10 look    380   238   618
## # ℹ 792 more rows

And now we compute the G-scores:

df <- tibble(
  word = temp$hw[seq(1, min(200, nrow(temp)))],
  category = NA,
  gscore = NA,
  pvalue = NA
)

for (j in seq_along(df$word))
{
  w <- temp$hw[j]

  # create the observed counts
  x11 <- sum(bnc$hw == w & lag(bnc$hw) == wset[1], na.rm=TRUE)
  x12 <- sum(bnc$hw == w & lag(bnc$hw) == wset[2], na.rm=TRUE)
  x21 <- sum(bnc$hw != w & lag(bnc$hw) == wset[1], na.rm=TRUE)
  x22 <- sum(bnc$hw != w & lag(bnc$hw) == wset[2], na.rm=TRUE)
  xtab <- matrix(c(x11, x12, x21, x22), ncol = 2, byrow = TRUE)

  # create expected counts
  rsum <- apply(xtab, 1, sum)
  csum <- apply(xtab, 2, sum)
  ecount <- rsum[row(xtab)] * csum[col(xtab)] / sum(xtab)
  ecount <- matrix(ecount, ncol = 2)

  # find dom. category, g-score, p-value
  df$category[j] <- if_else(xtab[1] > ecount[1], wset[1], wset[2])
  df$gscore[j] <- -2 * sum( xtab * log(ecount / xtab))
  df$pvalue[j] <- 1 - pchisq(df$gscore[j], df = 1)
}
df <- filter(df, !is.na(pvalue))

Here are the terms associated with “she”:

df |>
  filter(category == wset[1]) |>
  arrange(desc(gscore))
## # A tibble: 104 × 4
##    word    category gscore   pvalue
##    <chr>   <chr>     <dbl>    <dbl>
##  1 feel    she        95.9 0       
##  2 know    she        39.9 2.69e-10
##  3 face    she        35.3 2.80e- 9
##  4 could   she        31.6 1.92e- 8
##  5 suppose she        31.2 2.38e- 8
##  6 whisper she        29.4 5.85e- 8
##  7 have    she        26.1 3.20e- 7
##  8 cry     she        26.1 3.26e- 7
##  9 hear    she        26.0 3.43e- 7
## 10 turn    she        20.3 6.67e- 6
## # ℹ 94 more rows

And now the terms associated with “he”:

df |>
  filter(category == wset[2]) |>
  arrange(desc(gscore))
## # A tibble: 96 × 4
##    word  category gscore   pvalue
##    <chr> <chr>     <dbl>    <dbl>
##  1 may   he         43.1 5.28e-11
##  2 argue he         37.0 1.15e- 9
##  3 claim he         32.6 1.15e- 8
##  4 write he         26.4 2.85e- 7
##  5 can   he         25.9 3.66e- 7
##  6 play  he         25.5 4.48e- 7
##  7 win   he         19.5 1.01e- 5
##  8 join  he         15.5 8.30e- 5
##  9 will  he         15.0 1.07e- 4
## 10 warn  he         14.5 1.44e- 4
## # ℹ 86 more rows

Do you see any general patterns? Do these map onto, or cut against, gender (mis)perceptions that you are aware of in modern society? Can you summarize the patterns in any way?
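Before moving on, one programming note: we have now written essentially the same loop three times, changing only the word set and the direction of the shift (lead() versus lag()). If you find yourself repeating it yet again, one option is to wrap it in a function. Here is a sketch; the name gscore_table() and its interface are inventions for illustration, not from any package:

gscore_table <- function(bnc, words, wset, shift = lag)
{
  hw_adj <- shift(bnc$hw)   # compute the shifted column once, not per word
  df <- tibble(word = words, category = NA, gscore = NA, pvalue = NA)
  for (j in seq_along(words))
  {
    w <- words[j]

    # observed counts
    xtab <- matrix(c(sum(bnc$hw == w & hw_adj == wset[1], na.rm = TRUE),
                     sum(bnc$hw == w & hw_adj == wset[2], na.rm = TRUE),
                     sum(bnc$hw != w & hw_adj == wset[1], na.rm = TRUE),
                     sum(bnc$hw != w & hw_adj == wset[2], na.rm = TRUE)),
                   ncol = 2, byrow = TRUE)

    # expected counts, dominant category, G-score, p-value
    ecount <- rowSums(xtab)[row(xtab)] * colSums(xtab)[col(xtab)] / sum(xtab)
    df$category[j] <- if_else(xtab[1] > ecount[1], wset[1], wset[2])
    df$gscore[j] <- -2 * sum(xtab * log(ecount / xtab))
    df$pvalue[j] <- 1 - pchisq(df$gscore[j], df = 1)
  }
  filter(df, !is.na(pvalue))
}

# e.g., the "she"/"he" analysis above becomes:
# df <- gscore_table(bnc, temp$hw[seq(1, min(200, nrow(temp)))], c("she", "he"))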

6. Near Synonyms: Your Turn

I’ve copied the “Near Synonyms” code again below. Modify the word set by picking two near-synonym adjectives of your own (try to think of relatively common words so that there is enough data). Run all of the code and see whether the results turn up anything interesting.

wset <- c("big", "large")

temp <- bnc |>
  filter(!is.na(hw)) |>
  mutate(hw_prev = lag(hw)) |>
  filter(pos == "SUBST") |>     # only look at nouns
  filter(hw_prev %in% wset) |>
  group_by(hw, hw_prev) |>
  summarize(n = n()) |>
  ungroup() |>
  pivot_wider(names_from = "hw_prev", values_from = "n", values_fill = 0)

temp$total <- temp[[2]] + temp[[3]]
temp <- temp[(temp[[2]] > 0) & (temp[[3]] > 0),]
temp <- arrange(temp, desc(total))
temp
## # A tibble: 150 × 4
##    hw      large   big total
##    <chr>   <int> <int> <int>
##  1 number    200     1   201
##  2 part       51     8    59
##  3 company    43    10    53
##  4 scale      33     1    34
##  5 problem     7    21    28
##  6 city       13    13    26
##  7 firm       19     7    26
##  8 fish        2    23    25
##  9 man         5    20    25
## 10 sum        24     1    25
## # ℹ 140 more rows

The G-scores (you should not need to change anything here):

df <- tibble(
  word = temp$hw[seq(1, min(50, nrow(temp)))],
  category = NA,
  gscore = NA,
  pvalue = NA
)

for (j in seq_along(df$word))
{
  w <- temp$hw[j]

  # create the observed counts
  x11 <- sum(bnc$hw == w & lag(bnc$hw) == wset[1], na.rm=TRUE)
  x12 <- sum(bnc$hw == w & lag(bnc$hw) == wset[2], na.rm=TRUE)
  x21 <- sum(bnc$hw != w & lag(bnc$hw) == wset[1], na.rm=TRUE)
  x22 <- sum(bnc$hw != w & lag(bnc$hw) == wset[2], na.rm=TRUE)
  xtab <- matrix(c(x11, x12, x21, x22), ncol = 2, byrow = TRUE)

  # create expected counts
  rsum <- apply(xtab, 1, sum)
  csum <- apply(xtab, 2, sum)
  ecount <- rsum[row(xtab)] * csum[col(xtab)] / sum(xtab)
  ecount <- matrix(ecount, ncol = 2)

  # find dom. category, g-score, p-value
  df$category[j] <- if_else(xtab[1] > ecount[1], wset[1], wset[2])
  df$gscore[j] <- -2 * sum( xtab * log(ecount / xtab))
  df$pvalue[j] <- 1 - pchisq(df$gscore[j], df = 1)
}
df <- filter(df, !is.na(pvalue))

Here are the words most highly associated with your first term:

df |>
  filter(category == wset[1]) |>
  arrange(desc(gscore))
## # A tibble: 28 × 4
##    word     category gscore       pvalue
##    <chr>    <chr>     <dbl>        <dbl>
##  1 fish     big       31.0  0.0000000254
##  2 business big       21.5  0.00000353  
##  3 man      big       17.3  0.0000318   
##  4 problem  big       14.7  0.000126    
##  5 one      big       14.5  0.000140    
##  6 star     big       12.9  0.000322    
##  7 stage    big        9.63 0.00192     
##  8 boot     big        8.01 0.00464     
##  9 question big        6.90 0.00863     
## 10 prize    big        6.43 0.0112      
## # ℹ 18 more rows

And those most associated with your second term:

df |>
  filter(category == wset[2]) |>
  arrange(desc(gscore))
## # A tibble: 22 × 4
##    word        category gscore      pvalue
##    <chr>       <chr>     <dbl>       <dbl>
##  1 number      large    196.   0          
##  2 scale       large     26.1  0.000000324
##  3 part        large     19.5  0.0000103  
##  4 sum         large     17.6  0.0000270  
##  5 group       large     15.8  0.0000715  
##  6 size        large     10.3  0.00130    
##  7 company     large      9.94 0.00162    
##  8 measure     large      7.72 0.00547    
##  9 corporation large      7.24 0.00712    
## 10 family      large      5.75 0.0165     
## # ℹ 12 more rows

If you have time, and the counts are small or the results are not very interesting, try some different pairs of words.