Code Examples

Copy-paste-ready R and Python code for NFL analytics, from data loading to machine learning models.

R & Python Support: All examples include both R and Python versions.

Machine Learning

Apply ML techniques for predictions, clustering, and pattern recognition

Win Probability Model (Logistic Regression)
Build a basic win probability model using game state features.
Advanced
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Prepare training data
model_data <- pbp %>%
  filter(!is.na(score_differential), !is.na(wp)) %>%
  select(
    home_team, away_team, posteam,
    game_seconds_remaining, score_differential,
    down, ydstogo, yardline_100,
    posteam_timeouts_remaining, defteam_timeouts_remaining,
    receive_2h_ko,
    wp  # Target variable (use as proxy for outcome)
  ) %>%
  mutate(
    is_home = posteam == home_team,
    qtr = pmin(4, 4 - (game_seconds_remaining - 1) %/% 900)  # quarter derived from the game clock
  ) %>%
  drop_na()

# Split data
set.seed(42)
train_idx <- sample(nrow(model_data), 0.8 * nrow(model_data))
train <- model_data[train_idx, ]
test <- model_data[-train_idx, ]

# Fit logistic regression
model <- glm(
  wp ~ score_differential + game_seconds_remaining +
       is_home + down + ydstogo + yardline_100 +
       posteam_timeouts_remaining,
  data = train,
  family = "quasibinomial"
)

# Evaluate
test$pred_wp <- predict(model, test, type = "response")
test$correct <- (test$pred_wp > 0.5) == (test$wp > 0.5)

cat("Accuracy:", mean(test$correct) * 100, "%\n")
cat("Brier Score:", mean((test$pred_wp - test$wp)^2), "\n")
import nfl_data_py as nfl
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, brier_score_loss

pbp = nfl.import_pbp_data([2023])

# Prepare data
model_data = pbp[
    pbp["score_differential"].notna() &
    pbp["wp"].notna()
][["home_team", "posteam", "game_seconds_remaining",
   "score_differential", "down", "ydstogo", "yardline_100",
   "posteam_timeouts_remaining", "wp"]].dropna()

model_data["is_home"] = (model_data["posteam"] == model_data["home_team"]).astype(int)

# Features and target
features = ["score_differential", "game_seconds_remaining", "is_home",
            "down", "ydstogo", "yardline_100", "posteam_timeouts_remaining"]
X = model_data[features]
y = (model_data["wp"] > 0.5).astype(int)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.1f}%")
print(f"Brier Score: {brier_score_loss(y_test, y_prob):.4f}")
Packages: nflfastR tidyverse nfl_data_py pandas scikit-learn
Player Clustering (K-Means)
Cluster receivers by their play style using statistical profiles.
Advanced
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Create receiver profiles
receiver_profiles <- pbp %>%
  filter(!is.na(receiver_player_id), play_type == "pass") %>%
  group_by(receiver_player_id, receiver_player_name) %>%
  summarize(
    targets = n(),
    catch_rate = mean(complete_pass),
    avg_depth = mean(air_yards, na.rm = TRUE),
    yac_per_rec = mean(yards_after_catch[complete_pass == 1], na.rm = TRUE),
    heavy_box_rate = mean(defenders_in_box >= 7, na.rm = TRUE),  # box-count context; not used in the clustering below
    .groups = "drop"
  ) %>%
  filter(targets >= 40) %>%
  drop_na()

# Scale features
features <- receiver_profiles %>%
  select(catch_rate, avg_depth, yac_per_rec)
scaled_features <- scale(features)

# K-means clustering
set.seed(42)
kmeans_result <- kmeans(scaled_features, centers = 4, nstart = 25)

# Add clusters to data
receiver_profiles$cluster <- factor(kmeans_result$cluster)

# Analyze clusters
cluster_summary <- receiver_profiles %>%
  group_by(cluster) %>%
  summarize(
    n_players = n(),
    avg_catch_rate = mean(catch_rate),
    avg_depth = mean(avg_depth),
    avg_yac = mean(yac_per_rec)
  )

print(cluster_summary)

# Name clusters based on characteristics
# Cluster 1: Deep threats (high depth)
# Cluster 2: Possession receivers (high catch rate)
# etc.
import nfl_data_py as nfl
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

pbp = nfl.import_pbp_data([2023])

# Create receiver profiles
pass_plays = pbp[(pbp["receiver_player_id"].notna()) &
                 (pbp["play_type"] == "pass")]

receiver_profiles = (pass_plays.groupby(
    ["receiver_player_id", "receiver_player_name"])
    .agg(
        targets=("play_id", "count"),
        catch_rate=("complete_pass", "mean"),
        avg_depth=("air_yards", "mean"),
        yac_per_rec=("yards_after_catch", lambda x: x[pass_plays.loc[x.index, "complete_pass"] == 1].mean())
    )
    .reset_index()
    .dropna())

receiver_profiles = receiver_profiles[receiver_profiles["targets"] >= 40]

# Scale features
features = receiver_profiles[["catch_rate", "avg_depth", "yac_per_rec"]]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# K-means clustering
kmeans = KMeans(n_clusters=4, random_state=42, n_init=25)
receiver_profiles["cluster"] = kmeans.fit_predict(scaled_features)

# Analyze clusters
cluster_summary = (receiver_profiles.groupby("cluster")
    .agg(
        n_players=("receiver_player_name", "count"),
        avg_catch_rate=("catch_rate", "mean"),
        avg_depth=("avg_depth", "mean"),
        avg_yac=("yac_per_rec", "mean")
    )
    .reset_index())

print(cluster_summary)
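
The choice of four clusters here is an assumption. A quick silhouette scan (a sketch, reusing scaled_features from above) is one way to sanity-check k before interpreting the groups:

from sklearn.metrics import silhouette_score

# Higher silhouette means tighter, better-separated clusters
for k in range(2, 8):
    labels = KMeans(n_clusters=k, random_state=42, n_init=25).fit_predict(scaled_features)
    print(f"k={k}: silhouette = {silhouette_score(scaled_features, labels):.3f}")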
Packages: nflfastR tidyverse nfl_data_py pandas scikit-learn
Random Forest Play Prediction
Predict play type (pass vs run) using game state features.
Advanced
library(nflfastR)
library(tidyverse)
library(randomForest)

pbp <- load_pbp(2023)

# Prepare data for play prediction
model_data <- pbp %>%
  filter(play_type %in% c("pass", "run"), !is.na(down)) %>%
  mutate(
    is_pass = as.factor(play_type == "pass"),
    is_home = posteam == home_team
  ) %>%
  select(is_pass, down, ydstogo, yardline_100, score_differential,
         game_seconds_remaining, is_home) %>%
  drop_na()

# Split data
set.seed(42)
train_idx <- sample(nrow(model_data), 0.8 * nrow(model_data))
train <- model_data[train_idx, ]
test <- model_data[-train_idx, ]

# Train random forest
rf_model <- randomForest(
  is_pass ~ .,
  data = train,
  ntree = 100,
  importance = TRUE
)

# Evaluate
test$pred <- predict(rf_model, test)
accuracy <- mean(test$pred == test$is_pass)
cat("Accuracy:", accuracy * 100, "%\n")

# Feature importance
importance(rf_model) %>% as.data.frame() %>%
  rownames_to_column("feature") %>%
  arrange(desc(MeanDecreaseGini)) %>%
  print()
import nfl_data_py as nfl
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

pbp = nfl.import_pbp_data([2023])

# Prepare data
model_data = pbp[
    (pbp["play_type"].isin(["pass", "run"])) &
    (pbp["down"].notna())
].copy()

model_data["is_pass"] = (model_data["play_type"] == "pass").astype(int)
model_data["is_home"] = (model_data["posteam"] == model_data["home_team"]).astype(int)

features = ["down", "ydstogo", "yardline_100", "score_differential",
            "game_seconds_remaining", "is_home"]
model_data = model_data[features + ["is_pass"]].dropna()

# Split data
X = model_data[features]
y = model_data["is_pass"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Evaluate
y_pred = rf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.1f}%")
print("\nFeature Importance:")
importance = pd.DataFrame({
    "feature": features,
    "importance": rf.feature_importances_
}).sort_values("importance", ascending=False)
print(importance)
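
NFL offenses pass on roughly 55-60% of pass/run plays, so raw accuracy flatters play-type models. Comparing against the majority-class baseline (a small addition, reusing y_test from above) gives the honest yardstick:

# Majority-class baseline: always predict the more common play type
baseline = max(y_test.mean(), 1 - y_test.mean())
print(f"Majority-class baseline: {baseline * 100:.1f}%")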
Packages: nflfastR tidyverse randomForest nfl_data_py pandas scikit-learn
XGBoost Win Prediction
Build an XGBoost model to predict game outcomes.
Advanced
library(nflfastR)
library(tidyverse)
library(xgboost)

schedules <- load_schedules(2020:2023)
pbp <- load_pbp(2020:2023)

# Calculate team metrics per season
team_metrics <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
  group_by(season, posteam) %>%
  summarize(
    off_epa = mean(epa),
    pass_rate = mean(play_type == "pass"),
    .groups = "drop"
  ) %>%
  left_join(
    pbp %>%
      filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
      group_by(season, defteam) %>%
      summarize(def_epa = mean(epa), .groups = "drop"),
    by = c("season", "posteam" = "defteam")
  )

# Create game features
games <- schedules %>%
  filter(!is.na(result)) %>%
  left_join(team_metrics, by = c("season", "home_team" = "posteam")) %>%
  rename(home_off = off_epa, home_def = def_epa, home_pass = pass_rate) %>%
  left_join(team_metrics, by = c("season", "away_team" = "posteam")) %>%
  rename(away_off = off_epa, away_def = def_epa, away_pass = pass_rate) %>%
  mutate(home_win = as.numeric(result > 0)) %>%
  select(home_off, home_def, home_pass, away_off, away_def, away_pass, home_win) %>%
  drop_na()

# Prepare for XGBoost
train_matrix <- as.matrix(games %>% select(-home_win))
labels <- games$home_win

# Train XGBoost
xgb_model <- xgboost(
  data = train_matrix,
  label = labels,
  nrounds = 100,
  objective = "binary:logistic",
  verbose = 0
)

# Feature importance
importance <- xgb.importance(model = xgb_model)
print(importance)
import nfl_data_py as nfl
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

pbp = nfl.import_pbp_data([2020, 2021, 2022, 2023])
schedules = nfl.import_schedules([2020, 2021, 2022, 2023])

# Calculate team metrics
plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"].isin(["pass", "run"]))]

team_off = (plays.groupby(["season", "posteam"])
    .agg(off_epa=("epa", "mean"), pass_rate=("play_type", lambda x: (x == "pass").mean()))
    .reset_index())

team_def = (plays.groupby(["season", "defteam"])
    .agg(def_epa=("epa", "mean"))
    .reset_index())

team_metrics = team_off.merge(team_def, left_on=["season", "posteam"],
                               right_on=["season", "defteam"])

# Create game features
games = schedules[schedules["result"].notna()].copy()
games = games.merge(team_metrics[["season", "posteam", "off_epa", "def_epa", "pass_rate"]],
                    left_on=["season", "home_team"], right_on=["season", "posteam"])
games = games.rename(columns={"off_epa": "home_off", "def_epa": "home_def", "pass_rate": "home_pass"})
games = games.merge(team_metrics[["season", "posteam", "off_epa", "def_epa", "pass_rate"]],
                    left_on=["season", "away_team"], right_on=["season", "posteam"])
games = games.rename(columns={"off_epa": "away_off", "def_epa": "away_def", "pass_rate": "away_pass"})

games["home_win"] = (games["result"] > 0).astype(int)
features = ["home_off", "home_def", "home_pass", "away_off", "away_def", "away_pass"]
games = games[features + ["home_win"]].dropna()

# Train XGBoost
X_train, X_test, y_train, y_test = train_test_split(
    games[features], games["home_win"], test_size=0.2, random_state=42)

model = xgb.XGBClassifier(n_estimators=100, eval_metric="logloss")
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.1f}%")
print("\nFeature Importance:")
print(pd.DataFrame({"feature": features, "importance": model.feature_importances_}).sort_values("importance", ascending=False))
Packages: nflfastR tidyverse xgboost nfl_data_py pandas
Feature Engineering for NFL
Create advanced features for NFL prediction models.
Advanced
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Create advanced features
features <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
  mutate(
    # Situational features
    is_neutral_game_script = abs(score_differential) <= 7,
    is_garbage_time = (qtr == 4 & abs(score_differential) > 21) |
                      (qtr >= 4 & game_seconds_remaining < 120 & abs(score_differential) > 14),
    is_two_minute = game_seconds_remaining <= 120 & qtr %in% c(2, 4),
    is_redzone = yardline_100 <= 20,
    is_goal_to_go = goal_to_go == 1,
    is_third_or_fourth = down >= 3,

    # Distance buckets
    distance_bucket = case_when(
      ydstogo <= 3 ~ "short",
      ydstogo <= 7 ~ "medium",
      ydstogo <= 10 ~ "long",
      TRUE ~ "very_long"
    ),

    # Field position buckets
    field_bucket = case_when(
      yardline_100 >= 80 ~ "own_deep",
      yardline_100 >= 50 ~ "own_side",
      yardline_100 >= 20 ~ "opponent",
      TRUE ~ "redzone"
    ),

    # Time features
    game_pct_complete = 1 - (game_seconds_remaining / 3600),
    half = if_else(qtr <= 2, 1, 2),

    # Rolling metrics need a window function; see the pandas sketch after the Python version below
    pass_heavy = NA  # placeholder for a trailing pass rate
  )

# Show feature distribution
features %>%
  group_by(distance_bucket, field_bucket) %>%
  summarize(
    plays = n(),
    pass_rate = mean(play_type == "pass"),
    avg_epa = mean(epa),
    .groups = "drop"
  ) %>%
  print()
import nfl_data_py as nfl
import pandas as pd
import numpy as np

pbp = nfl.import_pbp_data([2023])

# Create advanced features
plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"].isin(["pass", "run"]))].copy()

# Situational features
plays["is_neutral"] = plays["score_differential"].abs() <= 7
plays["is_garbage_time"] = (
    ((plays["qtr"] == 4) & (plays["score_differential"].abs() > 21)) |
    ((plays["qtr"] >= 4) & (plays["game_seconds_remaining"] < 120) &
     (plays["score_differential"].abs() > 14))
)
plays["is_two_minute"] = (plays["game_seconds_remaining"] <= 120) & (plays["qtr"].isin([2, 4]))
plays["is_redzone"] = plays["yardline_100"] <= 20
plays["is_third_or_fourth"] = plays["down"] >= 3

# Distance buckets
def distance_bucket(yd):
    if yd <= 3: return "short"
    elif yd <= 7: return "medium"
    elif yd <= 10: return "long"
    else: return "very_long"

plays["distance_bucket"] = plays["ydstogo"].apply(distance_bucket)

# Field position buckets
def field_bucket(yd):
    if yd >= 80: return "own_deep"
    elif yd >= 50: return "own_side"
    elif yd >= 20: return "opponent"
    else: return "redzone"

plays["field_bucket"] = plays["yardline_100"].apply(field_bucket)

# Time features
plays["game_pct_complete"] = 1 - (plays["game_seconds_remaining"] / 3600)
plays["half"] = np.where(plays["qtr"] <= 2, 1, 2)

# Show feature distribution
feature_dist = (plays.groupby(["distance_bucket", "field_bucket"])
    .agg(
        plays=("epa", "count"),
        pass_rate=("play_type", lambda x: (x == "pass").mean()),
        avg_epa=("epa", "mean")
    )
    .reset_index())

print(feature_dist)
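
The pass_heavy placeholder in the R version calls for a window function. One way to build a trailing pass rate in pandas (a sketch; it assumes nflverse game_id values sort chronologically, which their YYYY_WW prefix provides) shifts by one play so the current play never leaks into its own feature:

# Trailing pass rate per offense over the previous 50 plays
plays = plays.sort_values(["posteam", "game_id", "play_id"])
plays["pass_flag"] = (plays["play_type"] == "pass").astype(int)
plays["rolling_pass_rate"] = (plays.groupby("posteam")["pass_flag"]
    .transform(lambda s: s.shift(1).rolling(50, min_periods=10).mean()))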
Packages: nflfastR tidyverse nfl_data_py pandas numpy
Cross-Validation Strategies
Implement proper cross-validation for NFL prediction models.
Advanced
library(nflfastR)
library(tidyverse)
library(caret)

pbp <- load_pbp(2020:2023)

# Prepare data
model_data <- pbp %>%
  filter(play_type %in% c("pass", "run"), !is.na(down), !is.na(epa)) %>%
  mutate(is_pass = factor(play_type == "pass", levels = c(FALSE, TRUE))) %>%
  select(season, week, is_pass, down, ydstogo, yardline_100,
         score_differential, game_seconds_remaining) %>%
  drop_na()

# Time-series cross-validation (train on past, test on future)
results <- list()
for (test_year in 2021:2023) {
  train <- model_data %>% filter(season < test_year)
  test <- model_data %>% filter(season == test_year)

  model <- glm(is_pass ~ down + ydstogo + yardline_100 +
                         score_differential + game_seconds_remaining,
               data = train, family = "binomial")

  preds <- predict(model, test, type = "response")
  pred_class <- preds > 0.5
  accuracy <- mean(pred_class == test$is_pass)

  results[[as.character(test_year)]] <- accuracy
  cat("Test Year:", test_year, "- Accuracy:", round(accuracy * 100, 1), "%\n")
}

# Average performance
cat("\nAverage Accuracy:", round(mean(unlist(results)) * 100, 1), "%\n")

# Week-by-week validation (within season)
# This accounts for within-season learning
week_results <- model_data %>%
  filter(season == 2023) %>%
  group_by(week) %>%
  group_split() %>%
  map_dbl(function(test_week) {
    w <- unique(test_week$week)
    train <- model_data %>% filter(season == 2023, week < w)
    if (nrow(train) < 100) return(NA)

    model <- glm(is_pass ~ ., data = train %>% select(-season, -week),
                 family = "binomial")
    preds <- predict(model, test_week, type = "response") > 0.5
    mean(preds == test_week$is_pass)
  })

cat("\nWeek-by-Week Validation (2023):")
print(week_results, na.print = "N/A")
import nfl_data_py as nfl
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import TimeSeriesSplit

pbp = nfl.import_pbp_data([2020, 2021, 2022, 2023])

# Prepare data
model_data = pbp[
    (pbp["play_type"].isin(["pass", "run"])) &
    (pbp["down"].notna()) &
    (pbp["epa"].notna())
].copy()

model_data["is_pass"] = (model_data["play_type"] == "pass").astype(int)
features = ["down", "ydstogo", "yardline_100", "score_differential", "game_seconds_remaining"]
model_data = model_data[["season", "week"] + features + ["is_pass"]].dropna()

# Time-series cross-validation (train on past, test on future)
results = []
for test_year in [2021, 2022, 2023]:
    train = model_data[model_data["season"] < test_year]
    test = model_data[model_data["season"] == test_year]

    X_train, y_train = train[features], train["is_pass"]
    X_test, y_test = test[features], test["is_pass"]

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    results.append(accuracy)
    print(f"Test Year: {test_year} - Accuracy: {accuracy * 100:.1f}%")

print(f"\nAverage Accuracy: {sum(results) / len(results) * 100:.1f}%")

# Alternative: sklearn TimeSeriesSplit (rows must already be in time order)
print("\nTimeSeriesSplit (3-fold):")
model_data = model_data.sort_values(["season", "week"]).reset_index(drop=True)
tscv = TimeSeriesSplit(n_splits=3)
X = model_data[features]
y = model_data["is_pass"]

for i, (train_idx, test_idx) in enumerate(tscv.split(X)):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    acc = accuracy_score(y_test, model.predict(X_test))
    print(f"Fold {i+1}: {acc * 100:.1f}%")
Packages: nflfastR tidyverse caret nfl_data_py pandas scikit-learn
Model Evaluation Metrics
Evaluate prediction models with proper metrics for NFL data.
Advanced
library(nflfastR)
library(tidyverse)
library(pROC)

pbp <- load_pbp(2023)
schedules <- load_schedules(2023)

# Build a simple win probability prediction
model_data <- pbp %>%
  filter(!is.na(wp), !is.na(score_differential)) %>%
  select(game_id, play_id, wp, score_differential,
         game_seconds_remaining, posteam) %>%
  drop_na()

# Create actual outcome
game_outcomes <- schedules %>%
  filter(!is.na(result)) %>%
  mutate(home_win = result > 0) %>%
  select(game_id, home_team, home_win)

model_data <- model_data %>%
  left_join(game_outcomes, by = "game_id") %>%
  mutate(actual_win = (posteam == home_team & home_win) |
                      (posteam != home_team & !home_win))

# Simple model based on score differential and time
model <- glm(actual_win ~ score_differential + game_seconds_remaining,
             data = model_data, family = "binomial")

model_data$pred_wp <- predict(model, type = "response")

# Evaluation metrics
# 1. Brier Score (lower is better)
brier <- mean((model_data$pred_wp - model_data$actual_win)^2)
cat("Brier Score:", round(brier, 4), "\n")

# 2. Log Loss
log_loss <- -mean(model_data$actual_win * log(model_data$pred_wp + 1e-10) +
                  (1 - model_data$actual_win) * log(1 - model_data$pred_wp + 1e-10))
cat("Log Loss:", round(log_loss, 4), "\n")

# 3. AUC-ROC
roc_obj <- roc(model_data$actual_win, model_data$pred_wp, quiet = TRUE)
cat("AUC-ROC:", round(auc(roc_obj), 4), "\n")

# 4. Calibration by probability bucket
calibration <- model_data %>%
  mutate(wp_bucket = cut(pred_wp, breaks = seq(0, 1, 0.1))) %>%
  group_by(wp_bucket) %>%
  summarize(
    n = n(),
    predicted = mean(pred_wp),
    actual = mean(actual_win)
  )

print("\nCalibration by Probability Bucket:")
print(calibration)
import nfl_data_py as nfl
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (brier_score_loss, log_loss, roc_auc_score,
                             accuracy_score, confusion_matrix)

pbp = nfl.import_pbp_data([2023])
schedules = nfl.import_schedules([2023])

# Prepare data
model_data = pbp[
    (pbp["wp"].notna()) &
    (pbp["score_differential"].notna())
][["game_id", "wp", "score_differential", "game_seconds_remaining", "posteam"]].dropna()

# Get actual outcomes
outcomes = schedules[schedules["result"].notna()][["game_id", "home_team", "result"]].copy()
outcomes["home_win"] = outcomes["result"] > 0

model_data = model_data.merge(outcomes, on="game_id")
model_data["actual_win"] = (
    ((model_data["posteam"] == model_data["home_team"]) & model_data["home_win"]) |
    ((model_data["posteam"] != model_data["home_team"]) & ~model_data["home_win"])
).astype(int)

# Simple model
X = model_data[["score_differential", "game_seconds_remaining"]]
y = model_data["actual_win"]

model = LogisticRegression(max_iter=1000)
model.fit(X, y)
pred_wp = model.predict_proba(X)[:, 1]

# Evaluation metrics
print("Model Evaluation Metrics:")
print(f"Brier Score: {brier_score_loss(y, pred_wp):.4f}")
print(f"Log Loss: {log_loss(y, pred_wp):.4f}")
print(f"AUC-ROC: {roc_auc_score(y, pred_wp):.4f}")
print(f"Accuracy: {accuracy_score(y, (pred_wp > 0.5).astype(int)) * 100:.1f}%")

# Calibration
model_data["pred_wp"] = pred_wp
model_data["wp_bucket"] = pd.cut(pred_wp, bins=np.arange(0, 1.1, 0.1))

calibration = (model_data.groupby("wp_bucket")
    .agg(n=("actual_win", "count"),
         predicted=("pred_wp", "mean"),
         actual=("actual_win", "mean"))
    .reset_index())

print("\nCalibration by Probability Bucket:")
print(calibration)
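
Reading calibration from a table is awkward. A reliability plot (a sketch, reusing y and pred_wp with scikit-learn's calibration_curve and matplotlib) shows the same comparison visually:

import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve

# Reliability diagram: predicted win probability vs observed win rate
frac_pos, mean_pred = calibration_curve(y, pred_wp, n_bins=10)
plt.plot(mean_pred, frac_pos, marker="o", label="Model")
plt.plot([0, 1], [0, 1], linestyle="--", label="Perfectly calibrated")
plt.xlabel("Predicted win probability")
plt.ylabel("Observed win rate")
plt.legend()
plt.show()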
Packages: nflfastR tidyverse pROC nfl_data_py pandas scikit-learn
Hyperparameter Tuning
Optimize model hyperparameters using grid search and cross-validation.
Advanced
library(nflfastR)
library(tidyverse)
library(caret)

pbp <- load_pbp(2023)

# Prepare data
model_data <- pbp %>%
  filter(play_type %in% c("pass", "run"), !is.na(down)) %>%
  mutate(is_pass = factor(play_type == "pass")) %>%
  select(is_pass, down, ydstogo, yardline_100,
         score_differential, game_seconds_remaining) %>%
  drop_na() %>%
  sample_n(min(10000, n()))  # Subsample for speed

# Define hyperparameter grid for Random Forest
rf_grid <- expand.grid(
  mtry = c(2, 3, 4, 5)
)

# Cross-validation control
train_control <- trainControl(
  method = "cv",
  number = 5,
  verboseIter = TRUE,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

# Train with grid search
set.seed(42)
rf_tuned <- train(
  is_pass ~ .,
  data = model_data,
  method = "rf",
  trControl = train_control,
  tuneGrid = rf_grid,
  metric = "ROC",
  ntree = 50
)

# Results
print(rf_tuned)
print(rf_tuned$bestTune)

# Plot tuning results
plot(rf_tuned)

# Best model performance
confusionMatrix(rf_tuned)
import nfl_data_py as nfl
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import make_scorer, roc_auc_score

pbp = nfl.import_pbp_data([2023])

# Prepare data
model_data = pbp[
    (pbp["play_type"].isin(["pass", "run"])) &
    (pbp["down"].notna())
].copy()

model_data["is_pass"] = (model_data["play_type"] == "pass").astype(int)
features = ["down", "ydstogo", "yardline_100", "score_differential", "game_seconds_remaining"]
model_data = model_data[features + ["is_pass"]].dropna()

# Subsample for speed
model_data = model_data.sample(n=min(10000, len(model_data)), random_state=42)
X = model_data[features]
y = model_data["is_pass"]

# Full hyperparameter grid (sampled by the randomized search sketched below)
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [5, 10, 15, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

# Smaller grid for faster demo
small_grid = {
    "n_estimators": [50, 100],
    "max_depth": [5, 10],
    "min_samples_split": [2, 5]
}

# Grid search with cross-validation
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    rf,
    small_grid,
    cv=5,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X, y)

# Results
print(f"\nBest Parameters: {grid_search.best_params_}")
print(f"Best CV Score (AUC): {grid_search.best_score_:.4f}")

# All results
results = pd.DataFrame(grid_search.cv_results_)
print("\nTop 5 Parameter Combinations:")
print(results.nsmallest(5, "rank_test_score")[["params", "mean_test_score", "std_test_score"]])
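
The full param_grid above is never searched exhaustively here. RandomizedSearchCV (a sketch, reusing rf, X, and y from above) samples a fixed number of combinations from it instead, which scales much better than an exhaustive grid:

from sklearn.model_selection import RandomizedSearchCV

# Sample 10 random combinations from the full grid at a fixed cost
random_search = RandomizedSearchCV(
    rf, param_grid, n_iter=10, cv=5,
    scoring="roc_auc", n_jobs=-1, random_state=42
)
random_search.fit(X, y)
print(f"Best Parameters: {random_search.best_params_}")
print(f"Best CV Score (AUC): {random_search.best_score_:.4f}")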
Packages: nflfastR tidyverse caret nfl_data_py pandas scikit-learn
Ensemble Methods
Combine multiple models for better prediction accuracy.
Advanced
library(nflfastR)
library(tidyverse)
library(caret)
library(randomForest)

pbp <- load_pbp(2023)

# Prepare data
model_data <- pbp %>%
  filter(play_type %in% c("pass", "run"), !is.na(down)) %>%
  mutate(is_pass = factor(play_type == "pass", levels = c(FALSE, TRUE), labels = c("run", "pass"))) %>%
  select(is_pass, down, ydstogo, yardline_100,
         score_differential, game_seconds_remaining) %>%
  drop_na() %>%
  sample_n(min(5000, n()))

# Split data
set.seed(42)
train_idx <- sample(nrow(model_data), 0.8 * nrow(model_data))
train <- model_data[train_idx, ]
test <- model_data[-train_idx, ]

# Train individual models
# 1. Logistic Regression
glm_model <- glm(is_pass ~ ., data = train, family = "binomial")
glm_pred <- predict(glm_model, test, type = "response")

# 2. Random Forest
rf_model <- randomForest(is_pass ~ ., data = train, ntree = 100)
rf_pred <- predict(rf_model, test, type = "prob")[, "pass"]

# 3. Simple rule-based (pass rate by down)
rule_pred <- test %>%
  left_join(
    train %>% group_by(down) %>% summarize(base_rate = mean(is_pass == "pass")),
    by = "down"
  ) %>%
  pull(base_rate)

# Ensemble predictions (average)
ensemble_pred <- (glm_pred + rf_pred + rule_pred) / 3
ensemble_class <- ifelse(ensemble_pred > 0.5, "pass", "run")

# Evaluate each model
actual <- test$is_pass

results <- tibble(
  Model = c("Logistic", "Random Forest", "Rule-based", "Ensemble"),
  Accuracy = c(
    mean((glm_pred > 0.5) == (actual == "pass")),
    mean((rf_pred > 0.5) == (actual == "pass")),
    mean((rule_pred > 0.5) == (actual == "pass")),
    mean(ensemble_class == actual)
  )
)

print(results)
import nfl_data_py as nfl
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

pbp = nfl.import_pbp_data([2023])

# Prepare data
model_data = pbp[
    (pbp["play_type"].isin(["pass", "run"])) &
    (pbp["down"].notna())
].copy()

model_data["is_pass"] = (model_data["play_type"] == "pass").astype(int)
features = ["down", "ydstogo", "yardline_100", "score_differential", "game_seconds_remaining"]
model_data = model_data[features + ["is_pass"]].dropna()
model_data = model_data.sample(n=min(5000, len(model_data)), random_state=42)

X = model_data[features]
y = model_data["is_pass"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Individual models
lr = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)

# Train and evaluate individually
results = []
for name, model in [("Logistic", lr), ("Random Forest", rf), ("KNN", knn)]:
    model.fit(X_train, y_train)
    acc = accuracy_score(y_test, model.predict(X_test))
    results.append({"Model": name, "Accuracy": acc})

# Voting Ensemble
voting = VotingClassifier(
    estimators=[("lr", lr), ("rf", rf), ("knn", knn)],
    voting="soft"
)
voting.fit(X_train, y_train)
results.append({"Model": "Voting Ensemble", "Accuracy": accuracy_score(y_test, voting.predict(X_test))})

# Stacking Ensemble
stacking = StackingClassifier(
    estimators=[("lr", LogisticRegression(max_iter=1000)),
                ("rf", RandomForestClassifier(n_estimators=50))],
    final_estimator=LogisticRegression()
)
stacking.fit(X_train, y_train)
results.append({"Model": "Stacking Ensemble", "Accuracy": accuracy_score(y_test, stacking.predict(X_test))})

print("Model Comparison:")
print(pd.DataFrame(results).sort_values("Accuracy", ascending=False))
Packages: nflfastR tidyverse caret randomForest nfl_data_py pandas scikit-learn
Neural Network for Spread Prediction
Build a simple neural network for predicting game spreads.
Advanced
library(nflfastR)
library(tidyverse)
library(nnet)

schedules <- load_schedules(2020:2023)
pbp <- load_pbp(2020:2023)

# Calculate team features
team_features <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
  group_by(season, posteam) %>%
  summarize(
    off_epa = mean(epa),
    pass_rate = mean(play_type == "pass"),
    .groups = "drop"
  ) %>%
  left_join(
    pbp %>%
      filter(!is.na(epa)) %>%
      group_by(season, defteam) %>%
      summarize(def_epa = mean(epa), .groups = "drop"),
    by = c("season", "posteam" = "defteam")
  )

# Create game dataset
games <- schedules %>%
  filter(!is.na(result), !is.na(spread_line)) %>%
  left_join(team_features, by = c("season", "home_team" = "posteam")) %>%
  rename(home_off = off_epa, home_def = def_epa) %>%
  left_join(team_features %>% select(season, posteam, off_epa, def_epa),
            by = c("season", "away_team" = "posteam")) %>%
  rename(away_off = off_epa, away_def = def_epa) %>%
  mutate(
    spread_error = result - spread_line  # positive = home beat the closing spread (spread_line > 0 means home favored)
  ) %>%
  select(home_off, home_def, away_off, away_def, spread_error) %>%
  drop_na()

# Scale features
scaled <- scale(games %>% select(-spread_error))
games_scaled <- cbind(as.data.frame(scaled), spread_error = games$spread_error)

# Split data
set.seed(42)
train_idx <- sample(nrow(games_scaled), 0.8 * nrow(games_scaled))
train <- games_scaled[train_idx, ]
test <- games_scaled[-train_idx, ]

# Train neural network
nn_model <- nnet(
  spread_error ~ .,
  data = train,
  size = 10,  # Hidden layer size
  linout = TRUE,  # Linear output for regression
  maxit = 500,
  decay = 0.01
)

# Predict and evaluate
predictions <- predict(nn_model, test)
mae <- mean(abs(predictions - test$spread_error))
rmse <- sqrt(mean((predictions - test$spread_error)^2))

cat("MAE:", round(mae, 2), "\n")
cat("RMSE:", round(rmse, 2), "\n")
cat("Correct Side %:", round(mean((predictions > 0) == (test$spread_error > 0)) * 100, 1), "%\n")
import nfl_data_py as nfl
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

pbp = nfl.import_pbp_data([2020, 2021, 2022, 2023])
schedules = nfl.import_schedules([2020, 2021, 2022, 2023])

# Calculate team features
plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"].isin(["pass", "run"]))]

team_off = (plays.groupby(["season", "posteam"])
    .agg(off_epa=("epa", "mean"), pass_rate=("play_type", lambda x: (x == "pass").mean()))
    .reset_index())

team_def = (plays.groupby(["season", "defteam"])
    .agg(def_epa=("epa", "mean"))
    .reset_index())

team_features = team_off.merge(team_def, left_on=["season", "posteam"],
                                right_on=["season", "defteam"])

# Create game dataset
games = schedules[(schedules["result"].notna()) & (schedules["spread_line"].notna())].copy()
games = games.merge(team_features[["season", "posteam", "off_epa", "def_epa"]],
                    left_on=["season", "home_team"], right_on=["season", "posteam"])
games = games.rename(columns={"off_epa": "home_off", "def_epa": "home_def"})
games = games.merge(team_features[["season", "posteam", "off_epa", "def_epa"]],
                    left_on=["season", "away_team"], right_on=["season", "posteam"])
games = games.rename(columns={"off_epa": "away_off", "def_epa": "away_def"})

games["spread_error"] = games["result"] + games["spread_line"]
features = ["home_off", "home_def", "away_off", "away_def"]
games = games[features + ["spread_error"]].dropna()

# Scale and split
X = games[features]
y = games["spread_error"]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train neural network
nn = MLPRegressor(hidden_layer_sizes=(16, 8), max_iter=500, random_state=42, early_stopping=True)
nn.fit(X_train, y_train)

# Evaluate
predictions = nn.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
correct_side = ((predictions > 0) == (y_test > 0)).mean()

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"Correct Side %: {correct_side * 100:.1f}%")
Packages: nflfastR tidyverse nnet nfl_data_py pandas scikit-learn
Quick Package Reference
R Packages
  • nflfastR - Play-by-play data with EPA
  • nflplotR - NFL team logos & plotting
  • tidyverse - Data manipulation & visualization
  • ggplot2 - Grammar-of-graphics plotting (loaded with tidyverse)
Python Packages
  • nfl_data_py - NFL data (nflverse compatible)
  • pandas - Data manipulation
  • matplotlib - Visualizations
  • scikit-learn - Machine learning

Ready to Dive Deeper?

Learn the theory behind these techniques in our comprehensive tutorial series

Browse Tutorials