Copy-paste-ready R and Python code for NFL analytics, from data loading to machine learning models. This section analyzes win probability (WP), win probability added (WPA), and clutch performance metrics.
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Win probability is already calculated in pbp data
# Key columns: wp, vegas_wp, home_wp, away_wp

# View WP at key moments
key_plays <- pbp %>%
  filter(!is.na(wp), !is.na(desc)) %>%
  select(game_id, qtr, time, posteam, desc, wp, wpa) %>%
  head(20)
print(key_plays)

# Average WP by score differential
wp_by_score <- pbp %>%
  filter(!is.na(wp), !is.na(score_differential)) %>%
  group_by(score_differential) %>%
  summarize(
    avg_wp = mean(wp),
    plays = n()
  ) %>%
  filter(plays >= 100, abs(score_differential) <= 21)
print(wp_by_score)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Win probability columns: wp, vegas_wp, home_wp, away_wp

# View key WP columns
wp_cols = ["game_id", "qtr", "time", "posteam", "wp", "wpa"]
key_plays = pbp[pbp["wp"].notna()][wp_cols].head(20)
print("Sample Win Probability Data:")
print(key_plays)

# Average WP by score differential
wp_by_score = (pbp[pbp["wp"].notna() & pbp["score_differential"].notna()]
    .groupby("score_differential")
    .agg(avg_wp=("wp", "mean"), plays=("wp", "count"))
    .reset_index())
wp_by_score = wp_by_score[
    (wp_by_score["plays"] >= 100) &
    (wp_by_score["score_differential"].abs() <= 21)
]
print("\nAverage WP by Score Differential:")
print(wp_by_score)
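To eyeball the relationship instead of scanning the table, a minimal ggplot sketch works (this assumes the wp_by_score data frame from the R snippet above is still in memory):

# Plot average WP against score differential (sketch; reuses wp_by_score)
ggplot(wp_by_score, aes(x = score_differential, y = avg_wp)) +
  geom_line() +
  geom_point() +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Score Differential", y = "Average Win Probability") +
  theme_minimal()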
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# QB WPA leaders
qb_wpa <- pbp %>%
  filter(!is.na(wpa), !is.na(passer_player_id)) %>%
  group_by(passer_player_id, passer_player_name) %>%
  summarize(
    dropbacks = n(),
    total_wpa = sum(wpa),
    positive_plays = sum(wpa > 0),
    negative_plays = sum(wpa < 0),
    .groups = "drop"
  ) %>%
  filter(dropbacks >= 200) %>%
  arrange(desc(total_wpa))
print(qb_wpa)

# Single biggest WPA plays of the season
big_plays <- pbp %>%
  filter(!is.na(wpa)) %>%
  arrange(desc(abs(wpa))) %>%
  select(game_id, posteam, qtr, desc, wpa) %>%
  head(10)
print(big_plays)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# QB WPA leaders
qb_plays = pbp[pbp["wpa"].notna() & pbp["passer_player_id"].notna()]
qb_wpa = (qb_plays.groupby(["passer_player_id", "passer_player_name"])
    .agg(
        dropbacks=("wpa", "count"),
        total_wpa=("wpa", "sum"),
        positive_plays=("wpa", lambda x: (x > 0).sum()),
        negative_plays=("wpa", lambda x: (x < 0).sum())
    )
    .reset_index())
qb_wpa = qb_wpa[qb_wpa["dropbacks"] >= 200].sort_values("total_wpa", ascending=False)
print("QB WPA Leaders:")
print(qb_wpa)

# Biggest WPA plays
biggest_plays = (pbp[pbp["wpa"].notna()]
    .assign(abs_wpa=lambda x: x["wpa"].abs())
    .nlargest(10, "abs_wpa")
    [["game_id", "posteam", "qtr", "desc", "wpa"]])
print("\nBiggest WPA Plays:")
print(biggest_plays)
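Total WPA rewards volume as much as efficiency, so a per-dropback view can reorder the leaderboard. A small follow-up sketch, reusing the qb_wpa data frame from the R snippet above:

# Normalize WPA by dropbacks for an efficiency view (sketch)
qb_wpa %>%
  mutate(wpa_per_dropback = total_wpa / dropbacks) %>%
  arrange(desc(wpa_per_dropback)) %>%
  print()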
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Define clutch situations: 4th quarter, WP between 25% and 75%
clutch_plays <- pbp %>%
  filter(
    !is.na(epa),
    !is.na(wp),
    qtr == 4,
    wp >= 0.25 & wp <= 0.75
  )

# QB clutch performance
qb_clutch <- clutch_plays %>%
  filter(!is.na(passer_player_id)) %>%
  group_by(passer_player_id, passer_player_name) %>%
  summarize(
    clutch_plays = n(),
    clutch_epa = sum(epa),
    clutch_wpa = sum(wpa, na.rm = TRUE),
    clutch_success = mean(success, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(clutch_plays >= 30) %>%
  arrange(desc(clutch_wpa))
print(qb_clutch)

# Compare clutch vs non-clutch EPA per play
clutch_compare <- pbp %>%
  filter(!is.na(passer_player_id), !is.na(epa), !is.na(wp)) %>%
  mutate(
    is_clutch = qtr == 4 & wp >= 0.25 & wp <= 0.75
  ) %>%
  group_by(passer_player_name, is_clutch) %>%
  summarize(epa_per_play = mean(epa), .groups = "drop") %>%
  pivot_wider(
    names_from = is_clutch,
    values_from = epa_per_play,
    names_prefix = "clutch_"
  )
print(clutch_compare)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Define clutch situations: 4th quarter, WP between 25% and 75%
clutch_plays = pbp[
    (pbp["epa"].notna()) &
    (pbp["qtr"] == 4) &
    (pbp["wp"] >= 0.25) &
    (pbp["wp"] <= 0.75)
]

# QB clutch performance
qb_clutch = (clutch_plays[clutch_plays["passer_player_id"].notna()]
    .groupby(["passer_player_id", "passer_player_name"])
    .agg(
        clutch_plays=("epa", "count"),
        clutch_epa=("epa", "sum"),
        clutch_wpa=("wpa", "sum"),
        clutch_success=("success", "mean")
    )
    .reset_index())
qb_clutch = qb_clutch[qb_clutch["clutch_plays"] >= 30].sort_values(
    "clutch_wpa", ascending=False)
print("QB Clutch Performance (4Q, WP 25-75%):")
print(qb_clutch)
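Whether clutch play is a repeatable skill is an open question; one quick probe is the gap between each QB's clutch and non-clutch EPA per play. A sketch reusing the clutch_compare data frame from the R snippet above (the clutch_TRUE and clutch_FALSE columns come from the pivot_wider step; QBs with no clutch plays get NA):

# Gap between clutch and non-clutch EPA per play (sketch)
clutch_compare %>%
  mutate(clutch_delta = clutch_TRUE - clutch_FALSE) %>%
  arrange(desc(clutch_delta)) %>%
  print()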
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Find biggest swing plays (largest WPA changes)
swing_plays <- pbp %>%
  filter(!is.na(wpa)) %>%
  mutate(abs_wpa = abs(wpa)) %>%
  arrange(desc(abs_wpa)) %>%
  select(game_id, week, posteam, qtr, time, down, ydstogo,
         desc, wpa, wp) %>%
  head(50)
print(swing_plays)

# Game-changing interceptions
ints <- pbp %>%
  filter(interception == 1, !is.na(wpa)) %>%
  arrange(desc(abs(wpa))) %>%
  select(game_id, passer_player_name, interception_player_name,
         wpa, wp, desc) %>%
  head(10)
print(ints)

# Game-changing touchdowns
tds <- pbp %>%
  filter(touchdown == 1, !is.na(wpa)) %>%
  arrange(desc(wpa)) %>%
  select(game_id, posteam, qtr, wpa, wp, desc) %>%
  head(10)
print(tds)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Find biggest swing plays
swing_plays = pbp[pbp["wpa"].notna()].copy()
swing_plays["abs_wpa"] = swing_plays["wpa"].abs()
swing_plays = swing_plays.nlargest(50, "abs_wpa")[
    ["game_id", "week", "posteam", "qtr", "time", "down",
     "ydstogo", "desc", "wpa", "wp"]
]
print("Biggest Swing Plays:")
print(swing_plays.head(20))

# Game-changing interceptions
ints = (pbp[(pbp["interception"] == 1) & (pbp["wpa"].notna())]
    .assign(abs_wpa=lambda x: x["wpa"].abs())
    .nlargest(10, "abs_wpa")
    [["game_id", "passer_player_name", "interception_player_name",
      "wpa", "wp", "desc"]])
print("\nBiggest Interceptions by WPA:")
print(ints)
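To see what kinds of plays drive the biggest swings, you can tally play_type across the top of the abs-WPA ranking. A minimal sketch on the same pbp data:

# Count play types among the 50 biggest swing plays (sketch)
pbp %>%
  filter(!is.na(wpa)) %>%
  arrange(desc(abs(wpa))) %>%
  head(50) %>%
  count(play_type, sort = TRUE) %>%
  print()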
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Find games with 4th quarter comebacks
comebacks <- pbp %>%
  filter(qtr == 4, !is.na(home_wp)) %>%
  group_by(game_id, home_team, away_team) %>%
  summarize(
    start_wp_home = first(home_wp),
    end_wp_home = last(home_wp),
    home_score = last(total_home_score),
    away_score = last(total_away_score),
    .groups = "drop"
  ) %>%
  mutate(
    home_won = home_score > away_score,
    comeback = (start_wp_home < 0.25 & home_won) |
      (start_wp_home > 0.75 & !home_won),
    wp_swing = abs(end_wp_home - start_wp_home)
  ) %>%
  filter(comeback) %>%
  arrange(desc(wp_swing))
print(comebacks)

# Teams with most comebacks
comeback_counts <- comebacks %>%
  mutate(
    comeback_team = if_else(home_won, home_team, away_team)
  ) %>%
  count(comeback_team, sort = TRUE)
print(comeback_counts)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Filter to 4th quarter plays
q4 = pbp[pbp["qtr"] == 4].copy()

# Get start and end WP for each game
game_wp = (q4.groupby(["game_id", "home_team", "away_team"])
    .agg(
        start_wp_home=("home_wp", "first"),
        end_wp_home=("home_wp", "last"),
        home_score=("total_home_score", "last"),
        away_score=("total_away_score", "last")
    )
    .reset_index())
game_wp["home_won"] = game_wp["home_score"] > game_wp["away_score"]
game_wp["comeback"] = (
    ((game_wp["start_wp_home"] < 0.25) & game_wp["home_won"]) |
    ((game_wp["start_wp_home"] > 0.75) & ~game_wp["home_won"])
)
game_wp["wp_swing"] = (game_wp["end_wp_home"] - game_wp["start_wp_home"]).abs()
comebacks = game_wp[game_wp["comeback"]].sort_values("wp_swing", ascending=False)
print("4th Quarter Comebacks:")
print(comebacks)
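Another way to size a comeback is the lowest in-game WP the eventual winner ever touched. A sketch on the same pbp data, restricted to home winners for brevity (mirror it with away_wp for road teams):

# Deepest WP hole escaped by a winning home team (sketch)
deepest_holes <- pbp %>%
  filter(!is.na(home_wp)) %>%
  group_by(game_id, home_team, away_team) %>%
  summarize(
    min_home_wp = min(home_wp),
    home_won = last(total_home_score) > last(total_away_score),
    .groups = "drop"
  ) %>%
  filter(home_won) %>%
  arrange(min_home_wp)
print(head(deepest_holes, 10))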
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Select a specific game (e.g., KC's week 1 home game)
game <- pbp %>%
  filter(week == 1, home_team == "KC") %>%
  filter(!is.na(wp), !is.na(posteam))

# Create WP chart data
wp_chart <- game %>%
  select(game_seconds_remaining, wp, posteam, home_team, away_team) %>%
  mutate(
    game_time = 3600 - game_seconds_remaining,
    home_wp = if_else(posteam == home_team, wp, 1 - wp)
  )

# Plot
ggplot(wp_chart, aes(x = game_time, y = home_wp)) +
  geom_line(color = "#E31837", linewidth = 1) +
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "gray50") +
  scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
  scale_x_continuous(
    breaks = c(0, 900, 1800, 2700, 3600),
    labels = c("Q1", "Q2", "Q3", "Q4", "End")
  ) +
  labs(
    title = paste(unique(game$away_team), "@", unique(game$home_team)),
    x = "Game Time",
    y = "Home Win Probability"
  ) +
  theme_minimal()
import nfl_data_py as nfl
import pandas as pd
import matplotlib.pyplot as plt

pbp = nfl.import_pbp_data([2023])

# Select a specific game (e.g., KC's week 1 home game)
game = pbp[(pbp["week"] == 1) & (pbp["home_team"] == "KC")]
game = game[game["wp"].notna()].copy()

# Calculate game time and home WP
game["game_time"] = 3600 - game["game_seconds_remaining"]
game["home_wp"] = game.apply(
    lambda x: x["wp"] if x["posteam"] == x["home_team"] else 1 - x["wp"],
    axis=1
)

# Create plot
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(game["game_time"], game["home_wp"], color="#E31837", linewidth=2)
ax.axhline(y=0.5, color="gray", linestyle="--", alpha=0.5)
ax.set_xlim(0, 3600)
ax.set_ylim(0, 1)
ax.set_xticks([0, 900, 1800, 2700, 3600])
ax.set_xticklabels(["Q1", "Q2", "Q3", "Q4", "End"])
ax.set_ylabel("Home Win Probability")
ax.set_title(f"{game['away_team'].iloc[0]} @ {game['home_team'].iloc[0]}")
plt.tight_layout()
plt.savefig("wp_chart.png", dpi=300)
plt.show()
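The Python version writes a PNG to disk; to match that in R, ggsave() saves the most recently drawn plot:

# Save the chart drawn above (ggsave defaults to the last plot)
ggsave("wp_chart.png", width = 12, height = 6, dpi = 300)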
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Correlation between EPA and WPA
plays <- pbp %>%
  filter(!is.na(epa), !is.na(wpa))
cat("Correlation EPA vs WPA:", cor(plays$epa, plays$wpa), "\n")

# EPA vs WPA by game situation
situation_comparison <- plays %>%
  mutate(
    situation = case_when(
      qtr <= 2 ~ "First Half",
      wp > 0.75 | wp < 0.25 ~ "Blowout",
      TRUE ~ "Competitive"
    )
  ) %>%
  group_by(situation) %>%
  summarize(
    plays = n(),
    avg_epa = mean(epa),
    avg_wpa = mean(wpa),
    epa_wpa_cor = cor(epa, wpa)
  )
print(situation_comparison)

# Cross-tab of EPA vs WPA quartiles: high EPA but low WPA suggests
# garbage time; low EPA but high WPA suggests clutch situations
quartile_tab <- plays %>%
  mutate(
    epa_quartile = ntile(epa, 4),
    wpa_quartile = ntile(wpa, 4)
  ) %>%
  count(epa_quartile, wpa_quartile) %>%
  pivot_wider(names_from = wpa_quartile, values_from = n)
print(quartile_tab)
import nfl_data_py as nfl
import pandas as pd
import numpy as np

pbp = nfl.import_pbp_data([2023])

# Filter plays with both EPA and WPA (copy to avoid SettingWithCopyWarning)
plays = pbp[(pbp["epa"].notna()) & (pbp["wpa"].notna())].copy()

# Overall correlation
correlation = plays["epa"].corr(plays["wpa"])
print(f"EPA vs WPA Correlation: {correlation:.3f}")

# By game situation
def get_situation(row):
    if row["qtr"] <= 2:
        return "First Half"
    elif row["wp"] > 0.75 or row["wp"] < 0.25:
        return "Blowout"
    else:
        return "Competitive"

plays["situation"] = plays.apply(get_situation, axis=1)
situation_comparison = (plays.groupby("situation")
    .agg(
        plays=("epa", "count"),
        avg_epa=("epa", "mean"),
        avg_wpa=("wpa", "mean")
    )
    .reset_index())
print("\nEPA vs WPA by Situation:")
print(situation_comparison)
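Leverage shifts over the course of a game, so the EPA-WPA link should too. A quick sketch reusing the plays data frame from the R snippet above, showing the correlation quarter by quarter:

# EPA-WPA correlation by quarter (sketch; qtr 5 is overtime)
plays %>%
  group_by(qtr) %>%
  summarize(
    n_plays = n(),
    epa_wpa_cor = cor(epa, wpa)
  ) %>%
  print()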
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Fourth down decisions in close 4th quarter games.
# There is no ready-made decision column, so derive one from play_type.
late_4th_downs <- pbp %>%
  filter(
    down == 4,
    qtr == 4,
    !is.na(wp),
    wp >= 0.20 & wp <= 0.80
  ) %>%
  mutate(
    decision = case_when(
      play_type %in% c("pass", "run") ~ "go",
      play_type == "punt" ~ "punt",
      play_type == "field_goal" ~ "field_goal"
    )
  ) %>%
  filter(!is.na(decision))

# Decision distribution
late_4th_downs %>%
  count(decision) %>%
  print()

# WP impact by decision
decision_impact <- late_4th_downs %>%
  group_by(decision) %>%
  summarize(
    attempts = n(),
    avg_wpa = mean(wpa, na.rm = TRUE),
    success_rate = mean(fourth_down_converted, na.rm = TRUE),
    avg_wp_before = mean(wp)
  )
print(decision_impact)

# Aggressive vs conservative teams
coach_decisions <- late_4th_downs %>%
  group_by(posteam) %>%
  summarize(
    fourth_downs = n(),
    go_for_it_rate = mean(decision == "go"),
    punt_rate = mean(decision == "punt"),
    fg_rate = mean(decision == "field_goal")
  ) %>%
  arrange(desc(go_for_it_rate))
print(coach_decisions)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Fourth down decisions in close 4th quarter games.
# There is no ready-made decision column, so derive one from play_type.
late_4th = pbp[
    (pbp["down"] == 4) &
    (pbp["qtr"] == 4) &
    (pbp["wp"] >= 0.20) &
    (pbp["wp"] <= 0.80)
].copy()
decision_map = {"pass": "go", "run": "go", "punt": "punt",
                "field_goal": "field_goal"}
late_4th["decision"] = late_4th["play_type"].map(decision_map)
late_4th = late_4th[late_4th["decision"].notna()]

# Decision distribution
print("4th Down Decision Distribution:")
print(late_4th["decision"].value_counts())

# WP impact by decision
decision_impact = (late_4th.groupby("decision")
    .agg(
        attempts=("wpa", "count"),
        avg_wpa=("wpa", "mean"),
        avg_wp_before=("wp", "mean")
    )
    .reset_index())
print("\nDecision Impact:")
print(decision_impact)

# Team aggressiveness
team_decisions = (late_4th.groupby("posteam")
    .agg(
        fourth_downs=("decision", "count"),
        go_for_it=("decision", lambda x: (x == "go").sum())
    )
    .reset_index())
team_decisions["go_rate"] = team_decisions["go_for_it"] / team_decisions["fourth_downs"]
print("\nTeam Aggressiveness:")
print(team_decisions.sort_values("go_rate", ascending=False))
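Distance matters here: 4th-and-1 and 4th-and-8 are very different decisions. A sketch reusing the late_4th_downs data frame from the R snippet above, with an arbitrary cutoff of 2 yards for "short":

# WPA by decision, split into short vs long to-go (sketch)
late_4th_downs %>%
  mutate(distance = if_else(ydstogo <= 2, "short (<=2)", "long (3+)")) %>%
  group_by(decision, distance) %>%
  summarize(
    plays = n(),
    avg_wpa = mean(wpa, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  print()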
Libraries used in this guide:
nflfastR - Play-by-play data with EPA
nflplotR - NFL team logos & plotting
tidyverse - Data manipulation & visualization
ggplot2 - Advanced visualizations
nfl_data_py - NFL data (nflverse compatible)
pandas - Data manipulation
matplotlib - Visualizations
scikit-learn - Machine learning

Learn the theory behind these techniques in our comprehensive tutorial series.