Copy-paste-ready R and Python code for NFL analytics, from data loading to machine learning models.
Calculate and analyze Expected Points Added (EPA) for teams, players, and situations.
library(nflfastR)
library(tidyverse)
# Load data
pbp <- load_pbp(2023)
# Filter to pass and rush plays
plays <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run"))

# Calculate offensive EPA/play
offense_epa <- plays %>%
  group_by(posteam) %>%
  summarize(
    plays = n(),
    total_epa = sum(epa),
    epa_per_play = mean(epa),
    pass_epa = mean(epa[play_type == "pass"]),
    rush_epa = mean(epa[play_type == "run"]),
    success_rate = mean(success)
  ) %>%
  arrange(desc(epa_per_play))

# Calculate defensive EPA/play (lower is better)
defense_epa <- plays %>%
  group_by(defteam) %>%
  summarize(
    plays = n(),
    epa_allowed = mean(epa),
    pass_epa_allowed = mean(epa[play_type == "pass"]),
    rush_epa_allowed = mean(epa[play_type == "run"])
  ) %>%
  arrange(epa_allowed)
print(offense_epa)
print(defense_epa)
import nfl_data_py as nfl
import pandas as pd
# Load data
pbp = nfl.import_pbp_data([2023])
# Filter to pass and rush plays
plays = pbp[(pbp["epa"].notna()) &
            (pbp["play_type"].isin(["pass", "run"]))]

# Calculate offensive EPA/play
offense_epa = (plays.groupby("posteam")
               .agg(
                   plays=("epa", "count"),
                   total_epa=("epa", "sum"),
                   epa_per_play=("epa", "mean"),
                   success_rate=("success", "mean")
               )
               .sort_values("epa_per_play", ascending=False)
               .reset_index())

# Calculate defensive EPA/play
defense_epa = (plays.groupby("defteam")
               .agg(epa_allowed=("epa", "mean"))
               .sort_values("epa_allowed")
               .reset_index())
print("Offensive EPA/Play:")
print(offense_epa.head(10))
print("\nDefensive EPA/Play:")
print(defense_epa.head(10))
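A quick way to turn these two tables into a tier chart is to plot offense against defense. This is a minimal matplotlib sketch, assuming the offense_epa and defense_epa DataFrames produced by the Python snippet above; an R equivalent with team logos is straightforward with nflplotR.

import matplotlib.pyplot as plt

# Join offensive and defensive EPA/play by team (illustrative sketch, not part of the recipe above)
teams = offense_epa.merge(defense_epa, left_on="posteam", right_on="defteam")

fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(teams["epa_per_play"], teams["epa_allowed"])
for _, row in teams.iterrows():
    ax.annotate(row["posteam"], (row["epa_per_play"], row["epa_allowed"]))
ax.invert_yaxis()  # lower EPA allowed (better defense) plots toward the top
ax.set_xlabel("Offensive EPA/play")
ax.set_ylabel("Defensive EPA/play allowed")
ax.set_title("Team efficiency, 2023")
plt.show()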
Packages: nflfastR, tidyverse, nfl_data_py, pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# QB EPA analysis
qb_epa <- pbp %>%
  filter(!is.na(epa), !is.na(passer_player_id)) %>%
  group_by(passer_player_id, passer_player_name) %>%
  summarize(
    dropbacks = n(),
    total_epa = sum(epa),
    epa_per_dropback = mean(epa),
    cpoe = mean(cpoe, na.rm = TRUE),
    success_rate = mean(success),
    avg_air_yards = mean(air_yards, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(dropbacks >= 200) %>%
  arrange(desc(epa_per_dropback))

# Add rank
qb_epa <- qb_epa %>%
  mutate(rank = row_number())
print(qb_epa, n = 20)
import nfl_data_py as nfl
import pandas as pd
pbp = nfl.import_pbp_data([2023])
# Filter to pass plays with QB identified
pass_plays = pbp[(pbp["epa"].notna()) &
                 (pbp["passer_player_id"].notna())]

# QB EPA analysis
qb_epa = (pass_plays.groupby(["passer_player_id", "passer_player_name"])
          .agg(
              dropbacks=("epa", "count"),
              total_epa=("epa", "sum"),
              epa_per_dropback=("epa", "mean"),
              cpoe=("cpoe", "mean"),
              success_rate=("success", "mean"),
              avg_air_yards=("air_yards", "mean")
          )
          .reset_index())

# Filter min dropbacks and rank
qb_epa = (qb_epa[qb_epa["dropbacks"] >= 200]
          .sort_values("epa_per_dropback", ascending=False)
          .reset_index(drop=True))
qb_epa["rank"] = range(1, len(qb_epa) + 1)
print(qb_epa.head(20))
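The classic way to present this table is an EPA/dropback vs. CPOE scatter. A minimal matplotlib sketch, assuming the qb_epa DataFrame from the Python snippet above:

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(qb_epa["cpoe"], qb_epa["epa_per_dropback"])
for _, row in qb_epa.iterrows():
    ax.annotate(row["passer_player_name"],
                (row["cpoe"], row["epa_per_dropback"]), fontsize=8)
# Dashed reference lines at the qualifier averages
ax.axhline(qb_epa["epa_per_dropback"].mean(), linestyle="--", linewidth=0.5)
ax.axvline(qb_epa["cpoe"].mean(), linestyle="--", linewidth=0.5)
ax.set_xlabel("CPOE (completion % over expected)")
ax.set_ylabel("EPA per dropback")
ax.set_title("QB efficiency, 2023 (min. 200 dropbacks)")
plt.show()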
Packages: nflfastR, tidyverse, nfl_data_py, pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# EPA by down
epa_by_down <- pbp %>%
  filter(!is.na(epa), down <= 4) %>%
  group_by(down) %>%
  summarize(
    plays = n(),
    avg_epa = mean(epa),
    pass_rate = mean(play_type == "pass", na.rm = TRUE),
    success_rate = mean(success)
  )
print(epa_by_down)

# EPA by distance buckets (first-down plays only)
epa_by_distance <- pbp %>%
  filter(!is.na(epa), down == 1) %>%
  mutate(
    distance_bucket = case_when(
      ydstogo <= 3 ~ "1-3 yards",
      ydstogo <= 7 ~ "4-7 yards",
      ydstogo <= 10 ~ "8-10 yards",
      TRUE ~ "11+ yards"
    )
  ) %>%
  group_by(distance_bucket) %>%
  summarize(
    plays = n(),
    avg_epa = mean(epa),
    success_rate = mean(success)
  )

# EPA by field position
epa_by_field <- pbp %>%
  filter(!is.na(epa)) %>%
  mutate(
    field_zone = case_when(
      yardline_100 >= 80 ~ "Own 1-20",
      yardline_100 >= 60 ~ "Own 21-40",
      yardline_100 >= 40 ~ "Midfield",
      yardline_100 >= 20 ~ "Opp 21-40",
      TRUE ~ "Red Zone"
    )
  ) %>%
  group_by(field_zone) %>%
  summarize(
    plays = n(),
    avg_epa = mean(epa),
    td_rate = mean(touchdown, na.rm = TRUE)
  )
import nfl_data_py as nfl
import pandas as pd
import numpy as np
pbp = nfl.import_pbp_data([2023])
# EPA by down
plays = pbp[(pbp["epa"].notna()) & (pbp["down"] <= 4)].copy()
epa_by_down = (plays.groupby("down")
               .agg(
                   plays=("epa", "count"),
                   avg_epa=("epa", "mean"),
                   success_rate=("success", "mean")
               )
               .reset_index())
print("EPA by Down:")
print(epa_by_down)

# EPA by field position
def get_field_zone(yard):
    if yard >= 80: return "Own 1-20"
    elif yard >= 60: return "Own 21-40"
    elif yard >= 40: return "Midfield"
    elif yard >= 20: return "Opp 21-40"
    else: return "Red Zone"

plays["field_zone"] = plays["yardline_100"].apply(get_field_zone)
epa_by_field = (plays.groupby("field_zone")
                .agg(
                    plays=("epa", "count"),
                    avg_epa=("epa", "mean"),
                    td_rate=("touchdown", "mean")
                )
                .reset_index())
print("\nEPA by Field Position:")
print(epa_by_field)
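The R version above also buckets first-down plays by distance to go; the Python block omits that step. A rough equivalent using pd.cut, assuming the plays DataFrame defined above:

# EPA by distance bucket on first down (sketch mirroring the R snippet)
first_down = plays[plays["down"] == 1].copy()
first_down["distance_bucket"] = pd.cut(
    first_down["ydstogo"],
    bins=[0, 3, 7, 10, 99],
    labels=["1-3 yards", "4-7 yards", "8-10 yards", "11+ yards"]
)
epa_by_distance = (first_down.groupby("distance_bucket", observed=True)
                   .agg(
                       plays=("epa", "count"),
                       avg_epa=("epa", "mean"),
                       success_rate=("success", "mean")
                   )
                   .reset_index())
print("\nEPA by Distance (1st down):")
print(epa_by_distance)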
Packages: nflfastR, tidyverse, nfl_data_py, pandas, numpy
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Game-level EPA
game_epa <- pbp %>%
  filter(!is.na(epa)) %>%
  group_by(game_id, posteam, week) %>%
  summarize(
    epa_per_play = mean(epa),
    .groups = "drop"
  )

# Calculate 4-game rolling average
rolling_epa <- game_epa %>%
  arrange(posteam, week) %>%
  group_by(posteam) %>%
  mutate(
    rolling_4_epa = zoo::rollmean(epa_per_play, k = 4,
                                  fill = NA, align = "right")
  ) %>%
  ungroup()

# Find teams that improved most
improvement <- rolling_epa %>%
  filter(week %in% c(5, 17)) %>%  # early-season vs. late-season snapshot; teams on bye in either week show NA
  select(posteam, week, rolling_4_epa) %>%
  pivot_wider(names_from = week, values_from = rolling_4_epa,
              names_prefix = "week_") %>%
  mutate(improvement = week_17 - week_5) %>%
  arrange(desc(improvement))
print(improvement)
import nfl_data_py as nfl
import pandas as pd
pbp = nfl.import_pbp_data([2023])
# Game-level EPA
game_epa = (pbp[pbp["epa"].notna()]
            .groupby(["game_id", "posteam", "week"])
            .agg(epa_per_play=("epa", "mean"))
            .reset_index())

# Calculate 4-game rolling average
game_epa = game_epa.sort_values(["posteam", "week"])
game_epa["rolling_4_epa"] = (game_epa.groupby("posteam")["epa_per_play"]
                             .transform(lambda x: x.rolling(4).mean()))

# Find teams that improved most
early = game_epa[game_epa["week"] == 5][["posteam", "rolling_4_epa"]]
late = game_epa[game_epa["week"] == 17][["posteam", "rolling_4_epa"]]
improvement = early.merge(late, on="posteam", suffixes=("_early", "_late"))
improvement["change"] = improvement["rolling_4_epa_late"] - improvement["rolling_4_epa_early"]
improvement = improvement.sort_values("change", ascending=False)
print(improvement)
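To see the trajectories rather than just the two endpoints, a minimal matplotlib sketch, assuming the game_epa DataFrame from the Python snippet above (the team abbreviations are only illustrative; substitute whichever teams you care about):

import matplotlib.pyplot as plt

for team in ["KC", "SF", "DET"]:  # hypothetical picks for illustration
    sub = game_epa[game_epa["posteam"] == team]
    plt.plot(sub["week"], sub["rolling_4_epa"], marker="o", label=team)
plt.axhline(0, color="gray", linewidth=0.5)
plt.xlabel("Week")
plt.ylabel("4-game rolling EPA/play")
plt.legend()
plt.title("Rolling offensive efficiency, 2023")
plt.show()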
Packages: nflfastR, tidyverse, zoo, nfl_data_py, pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# EPA by quarter and play type
epa_by_qtr <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run"), qtr <= 4) %>%
  group_by(qtr, play_type) %>%
  summarize(
    plays = n(),
    avg_epa = mean(epa),
    success_rate = mean(success),
    .groups = "drop"
  )
print(epa_by_qtr)

# Pass vs Run EPA advantage by quarter
epa_by_qtr %>%
  pivot_wider(id_cols = qtr, names_from = play_type,
              values_from = c(avg_epa, success_rate)) %>%
  mutate(pass_advantage = avg_epa_pass - avg_epa_run)
import nfl_data_py as nfl
import pandas as pd
pbp = nfl.import_pbp_data([2023])
# Filter valid plays
plays = pbp[(pbp["epa"].notna()) &
            (pbp["play_type"].isin(["pass", "run"])) &
            (pbp["qtr"] <= 4)]

# EPA by quarter and play type
epa_by_qtr = (plays.groupby(["qtr", "play_type"])
              .agg(
                  plays=("epa", "count"),
                  avg_epa=("epa", "mean"),
                  success_rate=("success", "mean")
              )
              .reset_index())
print("EPA by Quarter and Play Type:")
print(epa_by_qtr)

# Pass vs Run comparison
pivot = epa_by_qtr.pivot(index="qtr", columns="play_type", values="avg_epa")
pivot["pass_advantage"] = pivot["pass"] - pivot["run"]
print("\nPass EPA Advantage by Quarter:")
print(pivot)
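A grouped bar chart makes the quarter-by-quarter gap easy to read. A minimal sketch using pandas' built-in matplotlib plotting, assuming the pivot DataFrame from the Python snippet above:

import matplotlib.pyplot as plt

# Side-by-side bars: pass vs. run EPA/play for each quarter
pivot[["pass", "run"]].plot(kind="bar")
plt.xlabel("Quarter")
plt.ylabel("EPA per play")
plt.title("Pass vs. run efficiency by quarter, 2023")
plt.tight_layout()
plt.show()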
Packages: nflfastR, tidyverse, nfl_data_py, pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Red zone analysis (inside opponent 20)
red_zone <- pbp %>%
  filter(!is.na(epa), yardline_100 <= 20, play_type %in% c("pass", "run"))

# Team red zone efficiency
team_rz <- red_zone %>%
  group_by(posteam) %>%
  summarize(
    rz_plays = n(),
    rz_epa = mean(epa),
    rz_td_rate = mean(touchdown, na.rm = TRUE),
    rz_success_rate = mean(success),
    pass_rate = mean(play_type == "pass")
  ) %>%
  arrange(desc(rz_td_rate))
print(team_rz)

# Red zone vs rest of field comparison (pass/run plays only, for a like-for-like baseline)
rbind(
  red_zone %>% summarize(zone = "Red Zone", epa = mean(epa), success = mean(success)),
  pbp %>%
    filter(yardline_100 > 20, !is.na(epa), play_type %in% c("pass", "run")) %>%
    summarize(zone = "Outside RZ", epa = mean(epa), success = mean(success))
)
import nfl_data_py as nfl
import pandas as pd
pbp = nfl.import_pbp_data([2023])
# Red zone plays (inside opponent 20)
red_zone = pbp[(pbp["epa"].notna()) &
               (pbp["yardline_100"] <= 20) &
               (pbp["play_type"].isin(["pass", "run"]))]

# Team red zone efficiency
team_rz = (red_zone.groupby("posteam")
           .agg(
               rz_plays=("epa", "count"),
               rz_epa=("epa", "mean"),
               rz_td_rate=("touchdown", "mean"),
               rz_success_rate=("success", "mean")
           )
           .reset_index()
           .sort_values("rz_td_rate", ascending=False))
print("Team Red Zone Efficiency:")
print(team_rz)

# Red zone vs outside comparison (pass/run plays only, to match the red zone sample)
outside_rz = pbp[(pbp["epa"].notna()) &
                 (pbp["yardline_100"] > 20) &
                 (pbp["play_type"].isin(["pass", "run"]))]
print(f"\nRed Zone EPA: {red_zone['epa'].mean():.3f}")
print(f"Outside RZ EPA: {outside_rz['epa'].mean():.3f}")
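The R version also reports each team's red zone pass rate, which the Python block leaves out. A short addition, assuming the red_zone and team_rz DataFrames defined above:

# Red zone pass rate by team (mirrors the pass_rate column in the R snippet)
rz_pass_rate = (red_zone.assign(is_pass=red_zone["play_type"] == "pass")
                .groupby("posteam")["is_pass"].mean()
                .rename("pass_rate")
                .reset_index())
team_rz = team_rz.merge(rz_pass_rate, on="posteam")
print(team_rz[["posteam", "rz_td_rate", "pass_rate"]].head(10))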
Packages: nflfastR, tidyverse, nfl_data_py, pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Third down analysis
third_down <- pbp %>%
  filter(!is.na(epa), down == 3)

# By distance bucket
third_by_dist <- third_down %>%
  mutate(
    dist_bucket = case_when(
      ydstogo <= 3 ~ "Short (1-3)",
      ydstogo <= 6 ~ "Medium (4-6)",
      ydstogo <= 10 ~ "Long (7-10)",
      TRUE ~ "Very Long (11+)"
    )
  ) %>%
  group_by(dist_bucket) %>%
  summarize(
    attempts = n(),
    conversion_rate = mean(first_down, na.rm = TRUE),
    avg_epa = mean(epa),
    pass_rate = mean(play_type == "pass", na.rm = TRUE)
  )
print(third_by_dist)

# Team third down rankings
team_3rd <- third_down %>%
  group_by(posteam) %>%
  summarize(
    attempts = n(),
    conversions = sum(first_down, na.rm = TRUE),
    conv_rate = mean(first_down, na.rm = TRUE),
    epa_per_play = mean(epa)
  ) %>%
  arrange(desc(conv_rate))
print(team_3rd)
import nfl_data_py as nfl
import pandas as pd
import numpy as np
pbp = nfl.import_pbp_data([2023])
# Third down plays
third_down = pbp[(pbp["epa"].notna()) & (pbp["down"] == 3)].copy()

# Distance buckets
def dist_bucket(ydstogo):
    if ydstogo <= 3: return "Short (1-3)"
    elif ydstogo <= 6: return "Medium (4-6)"
    elif ydstogo <= 10: return "Long (7-10)"
    else: return "Very Long (11+)"

third_down["dist_bucket"] = third_down["ydstogo"].apply(dist_bucket)

# Analysis by distance
third_by_dist = (third_down.groupby("dist_bucket")
                 .agg(
                     attempts=("epa", "count"),
                     conversion_rate=("first_down", "mean"),
                     avg_epa=("epa", "mean")
                 )
                 .reset_index())
print("Third Down by Distance:")
print(third_by_dist)

# Team rankings
team_3rd = (third_down.groupby("posteam")
            .agg(
                attempts=("epa", "count"),
                conv_rate=("first_down", "mean"),
                epa_per_play=("epa", "mean")
            )
            .reset_index()
            .sort_values("conv_rate", ascending=False))
print("\nTeam Third Down Rankings:")
print(team_3rd.head(10))
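As in the R snippet, you may also want the pass rate in each bucket, since third-and-long is overwhelmingly a passing down. A short addition, assuming the third_down DataFrame defined above:

# Pass rate on third down by distance bucket (mirrors the pass_rate column in the R snippet)
pass_rate = (third_down.assign(is_pass=third_down["play_type"] == "pass")
             .groupby("dist_bucket")["is_pass"].mean()
             .rename("pass_rate"))
print("\nThird Down Pass Rate by Distance:")
print(pass_rate)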
Packages: nflfastR, tidyverse, nfl_data_py, pandas, numpy
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Define garbage time (various definitions)
# Common: Win prob < 10% or > 90% in 4th quarter
pbp_filtered <- pbp %>%
  filter(!is.na(epa)) %>%
  mutate(
    garbage_time = case_when(
      qtr == 4 & (wp < 0.10 | wp > 0.90) ~ TRUE,
      qtr == 4 & abs(score_differential) > 21 ~ TRUE,
      TRUE ~ FALSE
    )
  )

# Compare EPA with and without garbage time
comparison <- pbp_filtered %>%
  filter(play_type %in% c("pass", "run")) %>%
  group_by(posteam, garbage_time) %>%
  summarize(
    plays = n(),
    epa_per_play = mean(epa),
    .groups = "drop"
  ) %>%
  pivot_wider(names_from = garbage_time, values_from = c(plays, epa_per_play))

# Teams with biggest garbage time inflation
comparison %>%
  mutate(epa_diff = epa_per_play_TRUE - epa_per_play_FALSE) %>%
  arrange(desc(epa_diff)) %>%
  select(posteam, epa_per_play_FALSE, epa_per_play_TRUE, epa_diff)
import nfl_data_py as nfl
import pandas as pd
pbp = nfl.import_pbp_data([2023])
# Filter valid plays
plays = pbp[pbp["epa"].notna()].copy()
# Define garbage time
plays["garbage_time"] = (
    ((plays["qtr"] == 4) & ((plays["wp"] < 0.10) | (plays["wp"] > 0.90))) |
    ((plays["qtr"] == 4) & (plays["score_differential"].abs() > 21))
)

# Compare with and without garbage time
plays_filtered = plays[plays["play_type"].isin(["pass", "run"])]

# EPA without garbage time
clean_epa = (plays_filtered[~plays_filtered["garbage_time"]]
             .groupby("posteam")["epa"].mean())

# EPA with garbage time
all_epa = plays_filtered.groupby("posteam")["epa"].mean()

# Compare
comparison = pd.DataFrame({
    "clean_epa": clean_epa,
    "all_epa": all_epa
})
comparison["garbage_inflation"] = comparison["all_epa"] - comparison["clean_epa"]
comparison = comparison.sort_values("garbage_inflation", ascending=False)
print("Garbage Time EPA Inflation:")
print(comparison.head(10))
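The R version also keeps the play counts, which is useful context for how much garbage time a team actually played. A short addition, assuming the plays_filtered DataFrame defined above:

# How many snaps each team ran in garbage time vs. overall
garbage_counts = (plays_filtered.groupby("posteam")["garbage_time"]
                  .agg(total_plays="count", garbage_plays="sum"))
garbage_counts["garbage_pct"] = garbage_counts["garbage_plays"] / garbage_counts["total_plays"] * 100
print("\nGarbage Time Share of Plays:")
print(garbage_counts.sort_values("garbage_pct", ascending=False).head(10))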
Packages: nflfastR, tidyverse, nfl_data_py, pandas
library(nflfastR)
library(tidyverse)
# Load two consecutive seasons
pbp_2022 <- load_pbp(2022)
pbp_2023 <- load_pbp(2023)
# Calculate team EPA for each season
get_team_epa <- function(pbp, season) {
  pbp %>%
    filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
    group_by(posteam) %>%
    summarize(
      epa_per_play = mean(epa),
      pass_epa = mean(epa[play_type == "pass"]),
      rush_epa = mean(epa[play_type == "run"]),
      .groups = "drop"
    ) %>%
    mutate(season = season)
}

epa_2022 <- get_team_epa(pbp_2022, 2022)
epa_2023 <- get_team_epa(pbp_2023, 2023)

# Join and calculate correlation
combined <- epa_2022 %>%
  inner_join(epa_2023, by = "posteam", suffix = c("_2022", "_2023"))

# Correlations
cat("Overall EPA correlation:", cor(combined$epa_per_play_2022, combined$epa_per_play_2023), "\n")
cat("Pass EPA correlation:", cor(combined$pass_epa_2022, combined$pass_epa_2023), "\n")
cat("Rush EPA correlation:", cor(combined$rush_epa_2022, combined$rush_epa_2023), "\n")
import nfl_data_py as nfl
import pandas as pd
# Load two consecutive seasons
pbp_2022 = nfl.import_pbp_data([2022])
pbp_2023 = nfl.import_pbp_data([2023])
def get_team_epa(pbp):
    plays = pbp[(pbp["epa"].notna()) &
                (pbp["play_type"].isin(["pass", "run"]))]
    return plays.groupby("posteam")["epa"].mean()

epa_2022 = get_team_epa(pbp_2022)
epa_2023 = get_team_epa(pbp_2023)

# Combine
combined = pd.DataFrame({
    "epa_2022": epa_2022,
    "epa_2023": epa_2023
}).dropna()

# Calculate correlation
correlation = combined["epa_2022"].corr(combined["epa_2023"])
print(f"Year-over-Year EPA Correlation: {correlation:.3f}")
# Show biggest changes
combined["change"] = combined["epa_2023"] - combined["epa_2022"]
print("\nBiggest Improvements:")
print(combined.nlargest(5, "change"))
print("\nBiggest Declines:")
print(combined.nsmallest(5, "change"))
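The R version also splits the year-over-year correlation by pass and rush EPA, which typically shows passing efficiency to be the stickier of the two. A rough Python equivalent, assuming pbp_2022 and pbp_2023 are still loaded:

# Year-over-year stability of pass vs. rush EPA (mirrors the R correlations)
def split_epa(pbp, kind):
    plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"] == kind)]
    return plays.groupby("posteam")["epa"].mean()

for kind in ["pass", "run"]:
    corr = split_epa(pbp_2022, kind).corr(split_epa(pbp_2023, kind))
    print(f"{kind} EPA year-over-year correlation: {corr:.3f}")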
Packages: nflfastR, tidyverse, nfl_data_py, pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Define explosive plays (20+ yards)
explosive <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
  mutate(
    explosive = yards_gained >= 20,
    big_play = yards_gained >= 40
  )

# Team explosive play rates
team_explosive <- explosive %>%
  group_by(posteam) %>%
  summarize(
    total_plays = n(),
    explosive_plays = sum(explosive),
    explosive_rate = mean(explosive) * 100,
    explosive_epa = sum(epa[explosive]),
    non_explosive_epa = sum(epa[!explosive]),
    epa_from_explosive_pct = explosive_epa / sum(epa) * 100
  ) %>%
  arrange(desc(explosive_rate))
print(team_explosive)

# Explosive vs non-explosive EPA comparison
cat("\nAverage EPA on explosive plays:",
    mean(explosive$epa[explosive$explosive]), "\n")
cat("Average EPA on non-explosive plays:",
    mean(explosive$epa[!explosive$explosive]), "\n")
import nfl_data_py as nfl
import pandas as pd
pbp = nfl.import_pbp_data([2023])
# Filter valid plays
plays = pbp[(pbp["epa"].notna()) &
            (pbp["play_type"].isin(["pass", "run"]))].copy()

# Define explosive plays
plays["explosive"] = plays["yards_gained"] >= 20

# Team explosive play analysis
team_explosive = (plays.groupby("posteam")
                  .agg(
                      total_plays=("epa", "count"),
                      explosive_plays=("explosive", "sum"),
                      total_epa=("epa", "sum")
                  )
                  .reset_index())
team_explosive["explosive_rate"] = (team_explosive["explosive_plays"] /
                                    team_explosive["total_plays"] * 100)
team_explosive = team_explosive.sort_values("explosive_rate", ascending=False)
print("Team Explosive Play Rates:")
print(team_explosive)
# EPA comparison
explosive_epa = plays[plays["explosive"]]["epa"].mean()
non_explosive_epa = plays[~plays["explosive"]]["epa"].mean()
print(f"\nAverage EPA on explosive plays: {explosive_epa:.3f}")
print(f"Average EPA on non-explosive plays: {non_explosive_epa:.3f}")
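The R version also computes what share of each team's total EPA comes from explosive plays; the Python block stops at the rate. A rough equivalent, assuming the plays DataFrame defined above (teams whose total EPA is near zero will show extreme percentages, so read the column with that caveat):

# Share of each team's total EPA generated on explosive plays (mirrors epa_from_explosive_pct in R)
epa_split = (plays.groupby(["posteam", "explosive"])["epa"].sum()
             .unstack(fill_value=0)
             .rename(columns={True: "explosive_epa", False: "non_explosive_epa"}))
epa_split["explosive_share_pct"] = (
    epa_split["explosive_epa"]
    / (epa_split["explosive_epa"] + epa_split["non_explosive_epa"]) * 100
)
print("\nShare of EPA from Explosive Plays:")
print(epa_split.sort_values("explosive_share_pct", ascending=False).head(10))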
Packages: nflfastR, tidyverse, nfl_data_py, pandas
nflfastR - Play-by-play data with EPA
nflplotR - NFL team logos & plotting
tidyverse - Data manipulation & visualization
ggplot2 - Advanced visualizations
nfl_data_py - NFL data (nflverse compatible)
pandas - Data manipulation
matplotlib - Visualizations
scikit-learn - Machine learning

Learn the theory behind these techniques in our comprehensive tutorial series: Browse Tutorials