Copy-paste-ready R and Python code for NFL analytics, from data loading to machine learning models.
Apply machine-learning techniques for predictions, clustering, and pattern recognition.
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Prepare training data: model win probability (wp) from game state.
# NOTE: wp is itself a model output, so this is a proxy target for
# illustrating the workflow, not a true game outcome.
model_data <- pbp %>%
  filter(!is.na(score_differential), !is.na(wp)) %>%
  select(
    home_team, away_team, posteam,
    game_seconds_remaining, score_differential,
    down, ydstogo, yardline_100,
    posteam_timeouts_remaining, defteam_timeouts_remaining,
    receive_2h_ko,
    wp # Target variable (use as proxy for outcome)
  ) %>%
  mutate(
    is_home = posteam == home_team,
    # Quarter from seconds remaining. The original `4 - gsr %/% 900`
    # returned 0 at kickoff (gsr = 3600) and 5 at gsr = 0; subtracting
    # 1 second before flooring and clamping to [1, 4] fixes both edges.
    qtr = pmin(4, pmax(1, 4 - (game_seconds_remaining - 1) %/% 900))
  ) %>%
  drop_na()

# Split data (80/20 random split)
set.seed(42)
train_idx <- sample(nrow(model_data), 0.8 * nrow(model_data))
train <- model_data[train_idx, ]
test <- model_data[-train_idx, ]

# Fit logistic regression; quasibinomial allows a continuous [0, 1] target
model <- glm(
  wp ~ score_differential + game_seconds_remaining +
    is_home + down + ydstogo + yardline_100 +
    posteam_timeouts_remaining,
  data = train,
  family = "quasibinomial"
)

# Evaluate on held-out plays
test$pred_wp <- predict(model, test, type = "response")
test$correct <- (test$pred_wp > 0.5) == (test$wp > 0.5)
cat("Accuracy:", mean(test$correct) * 100, "%\n")
cat("Brier Score:", mean((test$pred_wp - test$wp)^2), "\n")
import nfl_data_py as nfl
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, brier_score_loss

pbp = nfl.import_pbp_data([2023])

# Keep plays where both the score differential and win probability are known.
cols = ["home_team", "posteam", "game_seconds_remaining",
        "score_differential", "down", "ydstogo", "yardline_100",
        "posteam_timeouts_remaining", "wp"]
mask = pbp["score_differential"].notna() & pbp["wp"].notna()
model_data = pbp.loc[mask, cols].dropna()

# Binary flag: is the possession team playing at home?
model_data["is_home"] = (model_data["posteam"] == model_data["home_team"]).astype(int)

# Features and target (wp > 0.5 used as a binary proxy outcome)
features = ["score_differential", "game_seconds_remaining", "is_home",
            "down", "ydstogo", "yardline_100", "posteam_timeouts_remaining"]
X, y = model_data[features], (model_data["wp"] > 0.5).astype(int)

# 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit logistic regression
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate: hard-label accuracy plus probabilistic Brier score
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.1f}%")
print(f"Brier Score: {brier_score_loss(y_test, y_prob):.4f}")
nflfastR
tidyverse
nfl_data_py
pandas
scikit-learn
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Create receiver profiles: per-receiver usage and efficiency metrics
receiver_profiles <- pbp %>%
  filter(!is.na(receiver_player_id), play_type == "pass") %>%
  group_by(receiver_player_id, receiver_player_name) %>%
  summarize(
    targets = n(),
    # FIX: na.rm = TRUE added — a single NA completion flag previously
    # made catch_rate NA, and the receiver was then silently dropped by
    # drop_na(). Now consistent with the other aggregates below.
    catch_rate = mean(complete_pass, na.rm = TRUE),
    avg_depth = mean(air_yards, na.rm = TRUE),
    yac_per_rec = mean(yards_after_catch[complete_pass == 1], na.rm = TRUE),
    # NOTE(review): defenders_in_box describes the run fit, not coverage,
    # so this is a weak proxy for "contested"; it is also unused by the
    # clustering features below — confirm intent.
    contested_rate = mean(defenders_in_box >= 7, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(targets >= 40) %>% # minimum volume to stabilize the rates
  drop_na()

# Scale features so each contributes equally to the distance metric
features <- receiver_profiles %>%
  select(catch_rate, avg_depth, yac_per_rec)
scaled_features <- scale(features)

# K-means clustering (nstart = 25 restarts to avoid poor local optima)
set.seed(42)
kmeans_result <- kmeans(scaled_features, centers = 4, nstart = 25)

# Add clusters to data
receiver_profiles$cluster <- factor(kmeans_result$cluster)

# Analyze clusters: size and average profile per cluster
cluster_summary <- receiver_profiles %>%
  group_by(cluster) %>%
  summarize(
    n_players = n(),
    avg_catch_rate = mean(catch_rate),
    avg_depth = mean(avg_depth),
    avg_yac = mean(yac_per_rec)
  )
print(cluster_summary)

# Name clusters based on characteristics
# Cluster 1: Deep threats (high depth)
# Cluster 2: Possession receivers (high catch rate)
# etc.
import nfl_data_py as nfl
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

pbp = nfl.import_pbp_data([2023])

# Create receiver profiles: one row per (receiver id, name) with volume
# and efficiency aggregates computed from pass plays only.
pass_plays = pbp[(pbp["receiver_player_id"].notna()) &
                 (pbp["play_type"] == "pass")]
receiver_profiles = (pass_plays.groupby(
    ["receiver_player_id", "receiver_player_name"])
    .agg(
        targets=("play_id", "count"),
        catch_rate=("complete_pass", "mean"),
        avg_depth=("air_yards", "mean"),
        # YAC averaged over completions only: the lambda looks back into
        # pass_plays by index to mask out incomplete passes. This relies on
        # pass_plays keeping its original index — do not reset it earlier.
        yac_per_rec=("yards_after_catch", lambda x: x[pass_plays.loc[x.index, "complete_pass"] == 1].mean())
    )
    .reset_index()
    .dropna())
# Keep receivers with enough volume for stable rates
receiver_profiles = receiver_profiles[receiver_profiles["targets"] >= 40]

# Scale features so k-means distances weight each metric equally
features = receiver_profiles[["catch_rate", "avg_depth", "yac_per_rec"]]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# K-means clustering (n_init=25 restarts to avoid poor local optima)
kmeans = KMeans(n_clusters=4, random_state=42, n_init=25)
receiver_profiles["cluster"] = kmeans.fit_predict(scaled_features)

# Analyze clusters: size and average profile per cluster
cluster_summary = (receiver_profiles.groupby("cluster")
    .agg(
        n_players=("receiver_player_name", "count"),
        avg_catch_rate=("catch_rate", "mean"),
        avg_depth=("avg_depth", "mean"),
        avg_yac=("yac_per_rec", "mean")
    )
    .reset_index())
print(cluster_summary)
nflfastR
tidyverse
nfl_data_py
pandas
scikit-learn
library(nflfastR)
library(tidyverse)
library(randomForest)

pbp <- load_pbp(2023)

# Build the modeling frame: predict pass vs. run from pre-snap game state
model_data <- pbp %>%
  filter(play_type %in% c("pass", "run"), !is.na(down)) %>%
  mutate(
    is_pass = as.factor(play_type == "pass"),
    is_home = posteam == home_team
  ) %>%
  select(
    is_pass, down, ydstogo, yardline_100, score_differential,
    game_seconds_remaining, is_home
  ) %>%
  drop_na()

# 80/20 random split
set.seed(42)
in_train <- sample(nrow(model_data), 0.8 * nrow(model_data))
train <- model_data[in_train, ]
test <- model_data[-in_train, ]

# Fit a random forest classifier on all remaining columns
rf_model <- randomForest(
  is_pass ~ .,
  data = train,
  ntree = 100,
  importance = TRUE
)

# Held-out accuracy
test$pred <- predict(rf_model, test)
accuracy <- mean(test$pred == test$is_pass)
cat("Accuracy:", accuracy * 100, "%\n")

# Which situational variables drive the pass/run call?
importance(rf_model) %>%
  as.data.frame() %>%
  rownames_to_column("feature") %>%
  arrange(desc(MeanDecreaseGini)) %>%
  print()
import nfl_data_py as nfl
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

pbp = nfl.import_pbp_data([2023])

# Keep ordinary scrimmage plays with a known down
plays = pbp[
    (pbp["play_type"].isin(["pass", "run"])) &
    (pbp["down"].notna())
].copy()

# Target and engineered home-field flag
plays["is_pass"] = (plays["play_type"] == "pass").astype(int)
plays["is_home"] = (plays["posteam"] == plays["home_team"]).astype(int)

features = ["down", "ydstogo", "yardline_100", "score_differential",
            "game_seconds_remaining", "is_home"]
model_data = plays[features + ["is_pass"]].dropna()

# 80/20 split
X, y = model_data[features], model_data["is_pass"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random forest play-call classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Held-out accuracy and per-feature importance
y_pred = rf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.1f}%")
print("\nFeature Importance:")
importance = (pd.DataFrame({"feature": features,
                            "importance": rf.feature_importances_})
              .sort_values("importance", ascending=False))
print(importance)
nflfastR
tidyverse
randomForest
nfl_data_py
pandas
scikit-learn
library(nflfastR)
library(tidyverse)
library(xgboost)

schedules <- load_schedules(2020:2023)
pbp <- load_pbp(2020:2023)

# Calculate team metrics per season: offensive EPA/play and pass rate,
# joined with defensive EPA/play allowed (lower def_epa = better defense)
team_metrics <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
  group_by(season, posteam) %>%
  summarize(
    off_epa = mean(epa),
    pass_rate = mean(play_type == "pass"),
    .groups = "drop"
  ) %>%
  left_join(
    pbp %>%
      filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
      group_by(season, defteam) %>%
      summarize(def_epa = mean(epa), .groups = "drop"),
    by = c("season", "posteam" = "defteam")
  )

# Create game features: attach each side's season metrics to every game.
# NOTE(review): these are full-season averages, so features for early-season
# games leak information from later weeks — fine for a demo, not for betting.
games <- schedules %>%
  filter(!is.na(result)) %>%
  left_join(team_metrics, by = c("season", "home_team" = "posteam")) %>%
  rename(home_off = off_epa, home_def = def_epa, home_pass = pass_rate) %>%
  left_join(team_metrics, by = c("season", "away_team" = "posteam")) %>%
  rename(away_off = off_epa, away_def = def_epa, away_pass = pass_rate) %>%
  mutate(home_win = as.numeric(result > 0)) %>% # result = home margin
  select(home_off, home_def, home_pass, away_off, away_def, away_pass, home_win) %>%
  drop_na()

# Prepare for XGBoost (xgboost wants a numeric matrix plus a label vector)
train_matrix <- as.matrix(games %>% select(-home_win))
labels <- games$home_win

# Train XGBoost on all games (no held-out set here; importance only)
xgb_model <- xgboost(
  data = train_matrix,
  label = labels,
  nrounds = 100,
  objective = "binary:logistic",
  verbose = 0
)

# Feature importance
importance <- xgb.importance(model = xgb_model)
print(importance)
import nfl_data_py as nfl
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

pbp = nfl.import_pbp_data([2020, 2021, 2022, 2023])
schedules = nfl.import_schedules([2020, 2021, 2022, 2023])

# Calculate team metrics: season offensive/defensive EPA and pass rate
plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"].isin(["pass", "run"]))]
team_off = (plays.groupby(["season", "posteam"])
            .agg(off_epa=("epa", "mean"),
                 pass_rate=("play_type", lambda x: (x == "pass").mean()))
            .reset_index())
team_def = (plays.groupby(["season", "defteam"])
            .agg(def_epa=("epa", "mean"))
            .reset_index())
team_metrics = team_off.merge(team_def, left_on=["season", "posteam"],
                              right_on=["season", "defteam"])

# Create game features: join each side's season metrics onto completed games.
# NOTE(review): season-long averages leak future information into
# early-season games; use rolling to-date metrics for honest backtests.
games = schedules[schedules["result"].notna()].copy()
games = games.merge(team_metrics[["season", "posteam", "off_epa", "def_epa", "pass_rate"]],
                    left_on=["season", "home_team"], right_on=["season", "posteam"])
games = games.rename(columns={"off_epa": "home_off", "def_epa": "home_def", "pass_rate": "home_pass"})
games = games.merge(team_metrics[["season", "posteam", "off_epa", "def_epa", "pass_rate"]],
                    left_on=["season", "away_team"], right_on=["season", "posteam"])
games = games.rename(columns={"off_epa": "away_off", "def_epa": "away_def", "pass_rate": "away_pass"})
games["home_win"] = (games["result"] > 0).astype(int)  # result = home margin

features = ["home_off", "home_def", "home_pass", "away_off", "away_def", "away_pass"]
games = games[features + ["home_win"]].dropna()

# Train XGBoost. FIX: `use_label_encoder` was deprecated in xgboost 1.6
# and removed in 2.x — passing it now only produces warnings/errors, so
# it is dropped here.
X_train, X_test, y_train, y_test = train_test_split(
    games[features], games["home_win"], test_size=0.2, random_state=42)
model = xgb.XGBClassifier(n_estimators=100, eval_metric="logloss")
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.1f}%")
print("\nFeature Importance:")
print(pd.DataFrame({"feature": features, "importance": model.feature_importances_})
      .sort_values("importance", ascending=False))
nflfastR
tidyverse
xgboost
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Create advanced features for modeling from raw play-by-play
features <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
  mutate(
    # Situational features
    is_neutral_game_script = abs(score_differential) <= 7,
    # Garbage time: 4th-quarter blowout, or a late two-plus-score margin
    # (qtr >= 4 also catches overtime rows, if any are present)
    is_garbage_time = (qtr == 4 & abs(score_differential) > 21) |
      (qtr >= 4 & game_seconds_remaining < 120 & abs(score_differential) > 14),
    is_two_minute = game_seconds_remaining <= 120 & qtr %in% c(2, 4),
    is_redzone = yardline_100 <= 20,
    is_goal_to_go = goal_to_go == 1,
    is_third_or_fourth = down >= 3,
    # Distance buckets (case_when takes the FIRST matching condition)
    distance_bucket = case_when(
      ydstogo <= 3 ~ "short",
      ydstogo <= 7 ~ "medium",
      ydstogo <= 10 ~ "long",
      TRUE ~ "very_long"
    ),
    # Field position buckets; yardline_100 is yards to the opponent's
    # end zone, so large values mean deep in OWN territory
    field_bucket = case_when(
      yardline_100 >= 80 ~ "own_deep",
      yardline_100 >= 50 ~ "own_side",
      yardline_100 >= 20 ~ "opponent",
      TRUE ~ "redzone"
    ),
    # Time features (3600 = regulation seconds)
    game_pct_complete = 1 - (game_seconds_remaining / 3600),
    half = if_else(qtr <= 2, 1, 2),
    # Rolling metrics (would need window function)
    pass_heavy = NA # Placeholder for rolling pass rate
  )

# Show feature distribution: volume, pass rate, and EPA per bucket pair
features %>%
  group_by(distance_bucket, field_bucket) %>%
  summarize(
    plays = n(),
    pass_rate = mean(play_type == "pass"),
    avg_epa = mean(epa),
    .groups = "drop"
  ) %>%
  print()
import nfl_data_py as nfl
import pandas as pd
import numpy as np

pbp = nfl.import_pbp_data([2023])

# Keep scrimmage plays with a valid EPA
plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"].isin(["pass", "run"]))].copy()

# Situational flags
plays["is_neutral"] = plays["score_differential"].abs() <= 7
blowout_q4 = (plays["qtr"] == 4) & (plays["score_differential"].abs() > 21)
late_big_lead = ((plays["qtr"] >= 4) &
                 (plays["game_seconds_remaining"] < 120) &
                 (plays["score_differential"].abs() > 14))
plays["is_garbage_time"] = blowout_q4 | late_big_lead
plays["is_two_minute"] = (plays["game_seconds_remaining"] <= 120) & (plays["qtr"].isin([2, 4]))
plays["is_redzone"] = plays["yardline_100"] <= 20
plays["is_third_or_fourth"] = plays["down"] >= 3

# Distance-to-go buckets: first matching condition wins; NaN comparisons
# are all False and fall through to "very_long", matching the original
# if/elif chain exactly.
plays["distance_bucket"] = np.select(
    [plays["ydstogo"] <= 3, plays["ydstogo"] <= 7, plays["ydstogo"] <= 10],
    ["short", "medium", "long"],
    default="very_long",
)

# Field-position buckets (yardline_100 = yards to the opponent end zone,
# so large values are deep in OWN territory)
plays["field_bucket"] = np.select(
    [plays["yardline_100"] >= 80, plays["yardline_100"] >= 50, plays["yardline_100"] >= 20],
    ["own_deep", "own_side", "opponent"],
    default="redzone",
)

# Time features
plays["game_pct_complete"] = 1 - (plays["game_seconds_remaining"] / 3600)
plays["half"] = np.where(plays["qtr"] <= 2, 1, 2)

# Show volume, pass rate, and EPA per distance/field bucket pair
feature_dist = (plays.groupby(["distance_bucket", "field_bucket"])
                .agg(plays=("epa", "count"),
                     pass_rate=("play_type", lambda x: (x == "pass").mean()),
                     avg_epa=("epa", "mean"))
                .reset_index())
print(feature_dist)
nflfastR
tidyverse
nfl_data_py
pandas
numpy
library(nflfastR)
library(tidyverse)
library(caret)

pbp <- load_pbp(2020:2023)

# Prepare data: predict pass vs. run from the pre-snap situation
model_data <- pbp %>%
  filter(play_type %in% c("pass", "run"), !is.na(down), !is.na(epa)) %>%
  mutate(is_pass = factor(play_type == "pass", levels = c(FALSE, TRUE))) %>%
  select(season, week, is_pass, down, ydstogo, yardline_100,
         score_differential, game_seconds_remaining) %>%
  drop_na()

# Time-series cross-validation (train on past, test on future)
results <- list()
for (test_year in 2021:2023) {
  train <- model_data %>% filter(season < test_year)
  test <- model_data %>% filter(season == test_year)
  model <- glm(is_pass ~ down + ydstogo + yardline_100 +
                 score_differential + game_seconds_remaining,
               data = train, family = "binomial")
  preds <- predict(model, test, type = "response")
  pred_class <- ifelse(preds > 0.5, TRUE, FALSE)
  # Comparing logical preds to the factor target works because `==`
  # coerces both sides to character ("TRUE"/"FALSE")
  accuracy <- mean(pred_class == test$is_pass)
  results[[as.character(test_year)]] <- accuracy
  cat("Test Year:", test_year, "- Accuracy:", round(accuracy * 100, 1), "%\n")
}

# Average performance across the three forward-chained folds
cat("\nAverage Accuracy:", round(mean(unlist(results)) * 100, 1), "%\n")

# Week-by-week validation (within season)
# This accounts for within-season learning
week_results <- model_data %>%
  filter(season == 2023) %>%
  group_by(week) %>%
  group_split() %>%
  map_dbl(function(test_week) {
    w <- unique(test_week$week)
    # Train only on weeks strictly before the test week
    train <- model_data %>% filter(season == 2023, week < w)
    if (nrow(train) < 100) return(NA) # not enough history in early weeks
    model <- glm(is_pass ~ ., data = train %>% select(-season, -week),
                 family = "binomial")
    preds <- predict(model, test_week, type = "response") > 0.5
    mean(preds == test_week$is_pass)
  })
cat("\nWeek-by-Week Validation (2023):")
print(week_results, na.print = "N/A")
import nfl_data_py as nfl
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import TimeSeriesSplit

pbp = nfl.import_pbp_data([2020, 2021, 2022, 2023])

# Prepare data: scrimmage plays with a known down and EPA
model_data = pbp[
    (pbp["play_type"].isin(["pass", "run"])) &
    (pbp["down"].notna()) &
    (pbp["epa"].notna())
].copy()
model_data["is_pass"] = (model_data["play_type"] == "pass").astype(int)
features = ["down", "ydstogo", "yardline_100", "score_differential", "game_seconds_remaining"]
model_data = model_data[["season", "week"] + features + ["is_pass"]].dropna()

# Time-series cross-validation: train on past seasons, test on the next one
results = []
for test_year in [2021, 2022, 2023]:
    train = model_data[model_data["season"] < test_year]
    test = model_data[model_data["season"] == test_year]
    X_train, y_train = train[features], train["is_pass"]
    X_test, y_test = test[features], test["is_pass"]
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results.append(accuracy)
    print(f"Test Year: {test_year} - Accuracy: {accuracy * 100:.1f}%")
print(f"\nAverage Accuracy: {sum(results) / len(results) * 100:.1f}%")

# Alternative: sklearn TimeSeriesSplit.
# FIX: TimeSeriesSplit assumes rows are already in chronological order,
# but nothing guaranteed that after filtering — sort by season/week first
# so each fold truly trains on the past and tests on the future.
print("\nTimeSeriesSplit (3-fold):")
model_data = model_data.sort_values(["season", "week"])
tscv = TimeSeriesSplit(n_splits=3)
X = model_data[features]
y = model_data["is_pass"]
for i, (train_idx, test_idx) in enumerate(tscv.split(X)):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    acc = accuracy_score(y_test, model.predict(X_test))
    print(f"Fold {i+1}: {acc * 100:.1f}%")
nflfastR
tidyverse
caret
nfl_data_py
pandas
scikit-learn
library(nflfastR)
library(tidyverse)
library(pROC)

pbp <- load_pbp(2023)
schedules <- load_schedules(2023)

# Build a simple win probability prediction from score and time
model_data <- pbp %>%
  filter(!is.na(wp), !is.na(score_differential)) %>%
  select(game_id, play_id, wp, score_differential,
         game_seconds_remaining, posteam) %>%
  drop_na()

# Create actual outcome: did the possession team end up winning?
game_outcomes <- schedules %>%
  filter(!is.na(result)) %>%
  mutate(home_win = result > 0) %>% # result = home margin
  select(game_id, home_team, home_win)

model_data <- model_data %>%
  left_join(game_outcomes, by = "game_id") %>%
  mutate(actual_win = (posteam == home_team & home_win) |
           (posteam != home_team & !home_win))

# Simple model based on score differential and time.
# NOTE(review): evaluated in-sample below (no held-out set), so every
# metric is optimistic; split by game before trusting these numbers.
model <- glm(actual_win ~ score_differential + game_seconds_remaining,
             data = model_data, family = "binomial")
model_data$pred_wp <- predict(model, type = "response")

# Evaluation metrics
# 1. Brier Score (lower is better)
brier <- mean((model_data$pred_wp - model_data$actual_win)^2)
cat("Brier Score:", round(brier, 4), "\n")

# 2. Log Loss (the 1e-10 epsilon guards against log(0))
log_loss <- -mean(model_data$actual_win * log(model_data$pred_wp + 1e-10) +
                    (1 - model_data$actual_win) * log(1 - model_data$pred_wp + 1e-10))
cat("Log Loss:", round(log_loss, 4), "\n")

# 3. AUC-ROC
roc_obj <- roc(model_data$actual_win, model_data$pred_wp, quiet = TRUE)
cat("AUC-ROC:", round(auc(roc_obj), 4), "\n")

# 4. Calibration: within each predicted-probability decile, does the
# realized win rate match the mean prediction?
calibration <- model_data %>%
  mutate(wp_bucket = cut(pred_wp, breaks = seq(0, 1, 0.1))) %>%
  group_by(wp_bucket) %>%
  summarize(
    n = n(),
    predicted = mean(pred_wp),
    actual = mean(actual_win)
  )
# FIX: print() does not interpret "\n" (it would show the two literal
# characters and quote the string); cat() writes the newline as intended.
cat("\nCalibration by Probability Bucket:\n")
print(calibration)
import nfl_data_py as nfl
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (brier_score_loss, log_loss, roc_auc_score,
                             accuracy_score, confusion_matrix)

pbp = nfl.import_pbp_data([2023])
schedules = nfl.import_schedules([2023])

# Prepare data: per-play game state plus nflfastR's own wp column
model_data = pbp[
    (pbp["wp"].notna()) &
    (pbp["score_differential"].notna())
][["game_id", "wp", "score_differential", "game_seconds_remaining", "posteam"]].dropna()

# Get actual outcomes (result = home margin, so > 0 means home win)
outcomes = schedules[schedules["result"].notna()][["game_id", "home_team", "result"]].copy()
outcomes["home_win"] = outcomes["result"] > 0
model_data = model_data.merge(outcomes, on="game_id")
# actual_win: did the possession team on this play go on to win the game?
model_data["actual_win"] = (
    ((model_data["posteam"] == model_data["home_team"]) & model_data["home_win"]) |
    ((model_data["posteam"] != model_data["home_team"]) & ~model_data["home_win"])
).astype(int)

# Simple model fit on ALL plays.
# NOTE(review): evaluated in-sample below (no train/test split), so every
# metric is optimistic; split by game before trusting these numbers.
X = model_data[["score_differential", "game_seconds_remaining"]]
y = model_data["actual_win"]
model = LogisticRegression(max_iter=1000)
model.fit(X, y)
pred_wp = model.predict_proba(X)[:, 1]

# Evaluation metrics
print("Model Evaluation Metrics:")
print(f"Brier Score: {brier_score_loss(y, pred_wp):.4f}")
print(f"Log Loss: {log_loss(y, pred_wp):.4f}")
print(f"AUC-ROC: {roc_auc_score(y, pred_wp):.4f}")
print(f"Accuracy: {accuracy_score(y, (pred_wp > 0.5).astype(int)) * 100:.1f}%")

# Calibration: within each predicted-probability decile, does the observed
# win rate match the mean prediction?
model_data["pred_wp"] = pred_wp
model_data["wp_bucket"] = pd.cut(pred_wp, bins=np.arange(0, 1.1, 0.1))
calibration = (model_data.groupby("wp_bucket")
               .agg(n=("actual_win", "count"),
                    predicted=("pred_wp", "mean"),
                    actual=("actual_win", "mean"))
               .reset_index())
print("\nCalibration by Probability Bucket:")
print(calibration)
nflfastR
tidyverse
pROC
nfl_data_py
pandas
scikit-learn
library(nflfastR)
library(tidyverse)
library(caret)

pbp <- load_pbp(2023)

# Prepare data
model_data <- pbp %>%
  filter(play_type %in% c("pass", "run"), !is.na(down)) %>%
  # FIX: caret::train() with classProbs = TRUE requires class levels that
  # are valid R variable names; factor(play_type == "pass") yields levels
  # "FALSE"/"TRUE" (reserved words), which train() rejects. Use
  # "run"/"pass" labels instead — consistent with the ensemble example
  # elsewhere in this file.
  mutate(is_pass = factor(play_type == "pass",
                          levels = c(FALSE, TRUE),
                          labels = c("run", "pass"))) %>%
  select(is_pass, down, ydstogo, yardline_100,
         score_differential, game_seconds_remaining) %>%
  drop_na() %>%
  sample_n(min(10000, n())) # Subsample for speed

# Define hyperparameter grid for Random Forest (caret's "rf" tunes mtry)
rf_grid <- expand.grid(
  mtry = c(2, 3, 4, 5)
)

# Cross-validation control: 5-fold CV with class probabilities so the
# ROC summary function can be used
train_control <- trainControl(
  method = "cv",
  number = 5,
  verboseIter = TRUE,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

# Train with grid search, selecting mtry by cross-validated AUC
set.seed(42)
rf_tuned <- train(
  is_pass ~ .,
  data = model_data,
  method = "rf",
  trControl = train_control,
  tuneGrid = rf_grid,
  metric = "ROC",
  ntree = 50
)

# Results
print(rf_tuned)
print(rf_tuned$bestTune)

# Plot tuning results
plot(rf_tuned)

# Best model performance (resampled confusion matrix)
confusionMatrix(rf_tuned)
import nfl_data_py as nfl
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import make_scorer, roc_auc_score

pbp = nfl.import_pbp_data([2023])

# Prepare data: pass/run plays with a known down
model_data = pbp[
    (pbp["play_type"].isin(["pass", "run"])) &
    (pbp["down"].notna())
].copy()
model_data["is_pass"] = (model_data["play_type"] == "pass").astype(int)
features = ["down", "ydstogo", "yardline_100", "score_differential", "game_seconds_remaining"]
model_data = model_data[features + ["is_pass"]].dropna()

# Subsample for speed
model_data = model_data.sample(n=min(10000, len(model_data)), random_state=42)
X = model_data[features]
y = model_data["is_pass"]

# Full hyperparameter grid (shown for reference: 108 combinations)
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [5, 10, 15, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

# Smaller grid actually searched below, keeping the demo fast
small_grid = {
    "n_estimators": [50, 100],
    "max_depth": [5, 10],
    "min_samples_split": [2, 5]
}

# 5-fold cross-validated grid search, scored by ROC AUC
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    rf,
    small_grid,
    cv=5,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X, y)

# Winning configuration
print(f"\nBest Parameters: {grid_search.best_params_}")
print(f"Best CV Score (AUC): {grid_search.best_score_:.4f}")

# All results, best (rank 1) first — rank_test_score is 1 for the winner,
# so nsmallest() returns the top combinations
results = pd.DataFrame(grid_search.cv_results_)
print("\nTop 5 Parameter Combinations:")
print(results.nsmallest(5, "rank_test_score")[["params", "mean_test_score", "std_test_score"]])
nflfastR
tidyverse
caret
nfl_data_py
pandas
scikit-learn
library(nflfastR)
library(tidyverse)
library(caret)
library(randomForest)

pbp <- load_pbp(2023)

# Prepare data: pass/run plays, target coded as a run/pass factor
model_data <- pbp %>%
  filter(play_type %in% c("pass", "run"), !is.na(down)) %>%
  mutate(is_pass = factor(play_type == "pass", levels = c(FALSE, TRUE), labels = c("run", "pass"))) %>%
  select(is_pass, down, ydstogo, yardline_100,
         score_differential, game_seconds_remaining) %>%
  drop_na() %>%
  sample_n(min(5000, n())) # subsample for speed

# Split data 80/20
set.seed(42)
train_idx <- sample(nrow(model_data), 0.8 * nrow(model_data))
train <- model_data[train_idx, ]
test <- model_data[-train_idx, ]

# Train individual models
# 1. Logistic Regression — glm models P(second factor level) = P("pass")
glm_model <- glm(is_pass ~ ., data = train, family = "binomial")
glm_pred <- predict(glm_model, test, type = "response")

# 2. Random Forest — take the "pass" column of the class-probability matrix
rf_model <- randomForest(is_pass ~ ., data = train, ntree = 100)
rf_pred <- predict(rf_model, test, type = "prob")[, "pass"]

# 3. Simple rule-based (pass rate by down): each test play receives the
# training-set base pass rate for its down
rule_pred <- test %>%
  left_join(
    train %>% group_by(down) %>% summarize(base_rate = mean(is_pass == "pass")),
    by = "down"
  ) %>%
  pull(base_rate)

# Ensemble predictions: unweighted average of the three probabilities
ensemble_pred <- (glm_pred + rf_pred + rule_pred) / 3
ensemble_class <- ifelse(ensemble_pred > 0.5, "pass", "run")

# Evaluate each model; factor == character comparisons coerce the
# factor to character, so these equality tests behave as intended
actual <- test$is_pass
results <- tibble(
  Model = c("Logistic", "Random Forest", "Rule-based", "Ensemble"),
  Accuracy = c(
    mean((glm_pred > 0.5) == (actual == "pass")),
    mean((rf_pred > 0.5) == (actual == "pass")),
    mean((rule_pred > 0.5) == (actual == "pass")),
    mean(ensemble_class == actual)
  )
)
print(results)
import nfl_data_py as nfl
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

pbp = nfl.import_pbp_data([2023])

# Pass/run plays with a known down
model_data = pbp[
    (pbp["play_type"].isin(["pass", "run"])) &
    (pbp["down"].notna())
].copy()
model_data["is_pass"] = (model_data["play_type"] == "pass").astype(int)
features = ["down", "ydstogo", "yardline_100", "score_differential", "game_seconds_remaining"]
model_data = model_data[features + ["is_pass"]].dropna()
model_data = model_data.sample(n=min(5000, len(model_data)), random_state=42)

# 80/20 split
X = model_data[features]
y = model_data["is_pass"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base learners
lr = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)

# Score each base learner on its own
results = []
for name, model in [("Logistic", lr), ("Random Forest", rf), ("KNN", knn)]:
    model.fit(X_train, y_train)
    results.append({"Model": name,
                    "Accuracy": accuracy_score(y_test, model.predict(X_test))})

# Soft-voting ensemble: averages the three predicted probabilities
voting = VotingClassifier(
    estimators=[("lr", lr), ("rf", rf), ("knn", knn)],
    voting="soft"
)
voting.fit(X_train, y_train)
results.append({"Model": "Voting Ensemble",
                "Accuracy": accuracy_score(y_test, voting.predict(X_test))})

# Stacking ensemble: a meta-learner combines base-model predictions
stacking = StackingClassifier(
    estimators=[("lr", LogisticRegression(max_iter=1000)),
                ("rf", RandomForestClassifier(n_estimators=50))],
    final_estimator=LogisticRegression()
)
stacking.fit(X_train, y_train)
results.append({"Model": "Stacking Ensemble",
                "Accuracy": accuracy_score(y_test, stacking.predict(X_test))})

print("Model Comparison:")
print(pd.DataFrame(results).sort_values("Accuracy", ascending=False))
nflfastR
tidyverse
caret
randomForest
nfl_data_py
pandas
scikit-learn
library(nflfastR)
library(tidyverse)
library(nnet)

schedules <- load_schedules(2020:2023)
pbp <- load_pbp(2020:2023)

# Calculate team features: season offensive EPA/play, pass rate, and
# defensive EPA/play allowed
team_features <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
  group_by(season, posteam) %>%
  summarize(
    off_epa = mean(epa),
    pass_rate = mean(play_type == "pass"),
    .groups = "drop"
  ) %>%
  left_join(
    pbp %>%
      # Filter to the same play types as the offense metric so both EPA
      # averages are computed over a consistent play population (the
      # original omitted the play_type filter here only)
      filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
      group_by(season, defteam) %>%
      summarize(def_epa = mean(epa), .groups = "drop"),
    by = c("season", "posteam" = "defteam")
  )

# Create game dataset from completed games with a betting line
games <- schedules %>%
  filter(!is.na(result), !is.na(spread_line)) %>%
  left_join(team_features, by = c("season", "home_team" = "posteam")) %>%
  rename(home_off = off_epa, home_def = def_epa) %>%
  left_join(team_features %>% select(season, posteam, off_epa, def_epa),
            by = c("season", "away_team" = "posteam")) %>%
  rename(away_off = off_epa, away_def = def_epa) %>%
  mutate(
    # FIX: in nflverse schedules, spread_line is positive when the HOME
    # team is favored and result is the home margin, so the home team
    # beats the spread when result > spread_line. The original
    # `result + spread_line` had the sign flipped.
    spread_error = result - spread_line # Positive = home beat spread
  ) %>%
  select(home_off, home_def, away_off, away_def, spread_error) %>%
  drop_na()

# Scale features (nnet is sensitive to input scale)
scaled <- scale(games %>% select(-spread_error))
games_scaled <- cbind(as.data.frame(scaled), spread_error = games$spread_error)

# Split data 80/20
set.seed(42)
train_idx <- sample(nrow(games_scaled), 0.8 * nrow(games_scaled))
train <- games_scaled[train_idx, ]
test <- games_scaled[-train_idx, ]

# Train a single-hidden-layer neural network (regression output)
nn_model <- nnet(
  spread_error ~ .,
  data = train,
  size = 10, # Hidden layer size
  linout = TRUE, # Linear output for regression
  maxit = 500,
  decay = 0.01 # L2 weight decay regularization
)

# Predict and evaluate: point error plus against-the-spread direction
predictions <- predict(nn_model, test)
mae <- mean(abs(predictions - test$spread_error))
rmse <- sqrt(mean((predictions - test$spread_error)^2))
cat("MAE:", round(mae, 2), "\n")
cat("RMSE:", round(rmse, 2), "\n")
cat("Correct Side %:", round(mean((predictions > 0) == (test$spread_error > 0)) * 100, 1), "%\n")
import nfl_data_py as nfl
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

pbp = nfl.import_pbp_data([2020, 2021, 2022, 2023])
schedules = nfl.import_schedules([2020, 2021, 2022, 2023])

# Calculate team features: season offensive/defensive EPA per play
plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"].isin(["pass", "run"]))]
team_off = (plays.groupby(["season", "posteam"])
            .agg(off_epa=("epa", "mean"), pass_rate=("play_type", lambda x: (x == "pass").mean()))
            .reset_index())
team_def = (plays.groupby(["season", "defteam"])
            .agg(def_epa=("epa", "mean"))
            .reset_index())
team_features = team_off.merge(team_def, left_on=["season", "posteam"],
                               right_on=["season", "defteam"])

# Create game dataset from completed games that have a betting line
games = schedules[(schedules["result"].notna()) & (schedules["spread_line"].notna())].copy()
games = games.merge(team_features[["season", "posteam", "off_epa", "def_epa"]],
                    left_on=["season", "home_team"], right_on=["season", "posteam"])
games = games.rename(columns={"off_epa": "home_off", "def_epa": "home_def"})
games = games.merge(team_features[["season", "posteam", "off_epa", "def_epa"]],
                    left_on=["season", "away_team"], right_on=["season", "posteam"])
games = games.rename(columns={"off_epa": "away_off", "def_epa": "away_def"})
# FIX: in nflverse schedules, spread_line is positive when the HOME team
# is favored and result is the home margin, so home beats the spread when
# result > spread_line. The original `result + spread_line` flipped the sign.
games["spread_error"] = games["result"] - games["spread_line"]

features = ["home_off", "home_def", "away_off", "away_def"]
games = games[features + ["spread_error"]].dropna()

# Scale (MLP is sensitive to feature scale) and split 80/20
X = games[features]
y = games["spread_error"]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train a small two-hidden-layer network with early stopping
nn = MLPRegressor(hidden_layer_sizes=(16, 8), max_iter=500, random_state=42, early_stopping=True)
nn.fit(X_train, y_train)

# Evaluate: point error plus directional (against-the-spread) accuracy
predictions = nn.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
correct_side = ((predictions > 0) == (y_test > 0)).mean()
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"Correct Side %: {correct_side * 100:.1f}%")
nflfastR
tidyverse
nnet
nfl_data_py
pandas
scikit-learn
nflfastR - Play-by-play data with EPA · nflplotR - NFL team logos & plotting · tidyverse - Data manipulation & visualization · ggplot2 - Advanced visualizations · nfl_data_py - NFL data (nflverse compatible) · pandas - Data manipulation · matplotlib - Visualizations · scikit-learn - Machine learning. Learn the theory behind these techniques in our comprehensive tutorial series.
Browse Tutorials