library(tidyverse)
library(forcats)
library(ggrepel)
library(smodels)

theme_set(theme_minimal())
options(dplyr.summarise.inform = FALSE)
options(width = 77L)

NBA Dataset

Today we are going to look at a dataset based on attempted shots in the NBA. Specifically, using features based on when and where the shot was taken, we want to predict whether or not the shot was successful.

set.seed(1)

nba <- read_csv("data/nba_shots.csv") %>%
  mutate(train_id = if_else(runif(n()) < 0.6, "train", "valid"))
nba
## # A tibble: 20,000 x 12
##      fgm period shot_clock dribbles touch_time shot_dist pts_type
##    <dbl>  <dbl>      <dbl>    <dbl>      <dbl>     <dbl>    <dbl>
##  1     0      2       18.6        0        0.8      23.7        2
##  2     0      4       11          0        1.2      26.7        3
##  3     0      3        4.9        2        1.8       9.4        2
##  4     0      4        2          3        3.1      24.3        2
##  5     0      4        9.9        4        3.9      14.6        2
##  6     1      3       12.7        6        6.7       9.1        2
##  7     0      1        3.1        4        5.9       3.5        2
##  8     0      2       24          2        4.2      35.2        3
##  9     0      3       19.8        0        0.7      22.7        3
## 10     0      4       11.6        2        3.2       3.9        2
## # … with 19,990 more rows, and 5 more variables: close_def_dist <dbl>,
## #   shooter_height <dbl>, defender_height <dbl>, player_name <chr>,
## #   train_id <chr>

Notice that the response of interest is coded as either 0 (shot missed) or 1 (shot made).

Linear Regression

We can apply a simple linear regression to this prediction task. To start, I will use only the variables shot_clock and shot_dist.

model <- nba %>%
  filter(train_id == "train") %>%
  lm(fgm ~ shot_clock + shot_dist, data = .)

summary(model)
## 
## Call:
## lm(formula = fgm ~ shot_clock + shot_dist, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7093 -0.4633  0.2947  0.4654  0.8081 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.5538102  0.0135470  40.881   <2e-16 ***
## shot_clock   0.0066012  0.0007786   8.478   <2e-16 ***
## shot_dist   -0.0098541  0.0005091 -19.355   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4894 on 12018 degrees of freedom
## Multiple R-squared:  0.04229,    Adjusted R-squared:  0.04214 
## F-statistic: 265.4 on 2 and 12018 DF,  p-value: < 2.2e-16

Plotting the predicted values we see a reasonable pattern:

nba %>%
  mutate(fgm_pred = predict(model, newdata = .)) %>%
  ggplot(aes(shot_clock, shot_dist)) +
    geom_point(aes(color = fgm_pred)) +
    scale_color_viridis_c()