Copy-paste-ready R and Python code for NFL analytics, from data loading to machine learning models.
Comprehensive team-level analysis and comparisons
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Calculate weekly EPA for each team
weekly_epa <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
  group_by(posteam, week) %>%
  summarize(
    plays = n(),
    epa_per_play = mean(epa),
    success_rate = mean(success) * 100,
    .groups = "drop"
  )

# Calculate season-to-date and rolling metrics
team_trends <- weekly_epa %>%
  arrange(posteam, week) %>%
  group_by(posteam) %>%
  mutate(
    cumulative_epa = cumsum(epa_per_play) / row_number(),  # running average of weekly EPA/play
    rolling_3wk = zoo::rollmean(epa_per_play, k = 3, fill = NA, align = "right")
  ) %>%
  ungroup()

# Find improving and declining teams
week_1_3 <- team_trends %>%
  filter(week <= 3) %>%
  group_by(posteam) %>%
  summarize(early_epa = mean(epa_per_play))

week_15_plus <- team_trends %>%
  filter(week >= 15) %>%
  group_by(posteam) %>%
  summarize(late_epa = mean(epa_per_play))

trends <- week_1_3 %>%
  inner_join(week_15_plus, by = "posteam") %>%
  mutate(improvement = late_epa - early_epa) %>%
  arrange(desc(improvement))

cat("Most Improved Teams:\n")
print(trends %>% head(5))
cat("\nDeclined Most:\n")
print(trends %>% tail(5))
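To eyeball those trajectories, here is a minimal ggplot2 sketch that assumes the team_trends data frame built in the R block above; the teams_to_plot abbreviations are arbitrary placeholders, so swap in whichever teams you care about.

library(ggplot2)

# Plot the 3-week rolling EPA/play for a handful of teams
teams_to_plot <- c("KC", "SF", "BUF", "DET")

team_trends %>%
  filter(posteam %in% teams_to_plot, !is.na(rolling_3wk)) %>%
  ggplot(aes(x = week, y = rolling_3wk, color = posteam)) +
  geom_line() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(
    title = "3-Week Rolling EPA per Play (2023)",
    x = "Week", y = "EPA per play", color = "Team"
  )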
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Weekly EPA
plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"].isin(["pass", "run"]))]
weekly_epa = (plays.groupby(["posteam", "week"])
              .agg(
                  plays=("epa", "count"),
                  epa_per_play=("epa", "mean"),
                  success_rate=("success", lambda x: x.mean() * 100)
              )
              .reset_index())

# Compare early vs late season
early = weekly_epa[weekly_epa["week"] <= 3].groupby("posteam")["epa_per_play"].mean()
late = weekly_epa[weekly_epa["week"] >= 15].groupby("posteam")["epa_per_play"].mean()

trends = pd.DataFrame({"early_epa": early, "late_epa": late})
trends["improvement"] = trends["late_epa"] - trends["early_epa"]
trends = trends.sort_values("improvement", ascending=False).reset_index()

print("Most Improved Teams:")
print(trends.head(5))
print("\nDeclined Most:")
print(trends.tail(5))
nflfastR
tidyverse
zoo
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Home vs Away EPA
home_away <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
  mutate(location = if_else(posteam == home_team, "Home", "Away")) %>%
  group_by(posteam, location) %>%
  summarize(
    plays = n(),
    epa = mean(epa),
    success_rate = mean(success) * 100,
    .groups = "drop"
  ) %>%
  pivot_wider(
    names_from = location,
    values_from = c(plays, epa, success_rate)
  ) %>%
  mutate(
    home_advantage = epa_Home - epa_Away
  ) %>%
  arrange(desc(home_advantage))

print("Home vs Away Performance:")
print(home_away %>% select(posteam, epa_Home, epa_Away, home_advantage))
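A quick way to see which teams lean on home field is to put both numbers on the same axes. This minimal ggplot2 sketch assumes the home_away data frame built above; teams above the dashed identity line perform better at home than on the road.

library(ggplot2)

ggplot(home_away, aes(x = epa_Away, y = epa_Home)) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  geom_point() +
  geom_text(aes(label = posteam), vjust = -0.7, size = 3) +
  labs(
    title = "Home vs Away EPA per Play (2023)",
    x = "Away EPA per play", y = "Home EPA per play"
  )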
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Home vs Away
plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"].isin(["pass", "run"]))].copy()
plays["location"] = plays.apply(lambda x: "Home" if x["posteam"] == x["home_team"] else "Away", axis=1)

home_away = (plays.groupby(["posteam", "location"])
             .agg(
                 plays=("epa", "count"),
                 epa=("epa", "mean"),
                 success_rate=("success", lambda x: x.mean() * 100)
             )
             .reset_index())

# Pivot
home = home_away[home_away["location"] == "Home"].set_index("posteam")
away = home_away[home_away["location"] == "Away"].set_index("posteam")

comparison = pd.DataFrame({
    "Home EPA": home["epa"],
    "Away EPA": away["epa"]
})
comparison["Home Advantage"] = comparison["Home EPA"] - comparison["Away EPA"]
comparison = comparison.sort_values("Home Advantage", ascending=False).reset_index()

print("Home vs Away Performance:")
print(comparison)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

schedules <- load_schedules(2023)

# Calculate point differential stats
point_diff <- schedules %>%
  filter(!is.na(result)) %>%
  pivot_longer(c(home_team, away_team), names_to = "location", values_to = "team") %>%
  mutate(
    points_for = if_else(location == "home_team", home_score, away_score),
    points_against = if_else(location == "home_team", away_score, home_score),
    point_diff = points_for - points_against,
    win = point_diff > 0
  ) %>%
  group_by(team) %>%
  summarize(
    games = n(),
    wins = sum(win),
    total_pf = sum(points_for),
    total_pa = sum(points_against),
    total_diff = sum(point_diff),
    ppg = mean(points_for),
    papg = mean(points_against),
    avg_diff = mean(point_diff),
    .groups = "drop"
  ) %>%
  arrange(desc(total_diff))

print("Point Differential Rankings:")
print(point_diff)
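Point differential is often a better guide to team quality than the win column itself. As a minimal follow-up sketch (assuming the point_diff data frame built above), a simple linear model estimates how many wins a point of season-long differential is worth:

# How many wins is a point of season differential worth?
fit <- lm(wins ~ total_diff, data = point_diff)
summary(fit)

# The slope is a rough wins-per-point-of-differential estimate
coef(fit)["total_diff"]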
import nfl_data_py as nfl
import pandas as pd

schedules = nfl.import_schedules([2023])

# Calculate for each team
games = schedules[schedules["result"].notna()].copy()

# Home games
home = games[["home_team", "home_score", "away_score"]].copy()
home.columns = ["team", "points_for", "points_against"]

# Away games
away = games[["away_team", "away_score", "home_score"]].copy()
away.columns = ["team", "points_for", "points_against"]

all_games = pd.concat([home, away])
all_games["point_diff"] = all_games["points_for"] - all_games["points_against"]
all_games["win"] = all_games["point_diff"] > 0

point_diff = (all_games.groupby("team")
              .agg(
                  games=("point_diff", "count"),
                  wins=("win", "sum"),
                  total_pf=("points_for", "sum"),
                  total_pa=("points_against", "sum"),
                  total_diff=("point_diff", "sum"),
                  ppg=("points_for", "mean"),
                  avg_diff=("point_diff", "mean")
              )
              .reset_index()
              .sort_values("total_diff", ascending=False))

print("Point Differential Rankings:")
print(point_diff)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

schedules <- load_schedules(2023)

# Calculate team scoring
team_scoring <- schedules %>%
  filter(!is.na(result)) %>%
  pivot_longer(c(home_team, away_team), names_to = "location", values_to = "team") %>%
  mutate(
    pf = if_else(location == "home_team", home_score, away_score),
    pa = if_else(location == "home_team", away_score, home_score),
    win = pf > pa
  ) %>%
  group_by(team) %>%
  summarize(
    games = n(),
    actual_wins = sum(win),
    points_for = sum(pf),
    points_against = sum(pa),
    .groups = "drop"
  )

# Pythagorean expectation (exponent = 2.37 for NFL)
team_scoring <- team_scoring %>%
  mutate(
    pyth_exp = points_for^2.37 / (points_for^2.37 + points_against^2.37),
    expected_wins = pyth_exp * games,
    luck = actual_wins - expected_wins
  ) %>%
  arrange(desc(luck))

print("Pythagorean Win Expectation:")
print(team_scoring %>%
        select(team, actual_wins, expected_wins, luck) %>%
        mutate(across(c(expected_wins, luck), ~round(., 1))))
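The formula is easy to reuse outside the pipeline. A minimal helper sketch using the same 2.37 exponent as above, called with purely hypothetical scoring totals:

# Expected wins from points for/against (k = 2.37 is the commonly used NFL exponent)
pyth_wins <- function(pf, pa, games = 17, k = 2.37) {
  games * pf^k / (pf^k + pa^k)
}

# Hypothetical example: a team that scores 450 points and allows 380
pyth_wins(pf = 450, pa = 380)
# Compare with a flatter exponent to see how sensitive the estimate is
pyth_wins(pf = 450, pa = 380, k = 2.0)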
import nfl_data_py as nfl
import pandas as pd
import numpy as np

schedules = nfl.import_schedules([2023])
games = schedules[schedules["result"].notna()].copy()

# Home games
home = games[["home_team", "home_score", "away_score"]].copy()
home.columns = ["team", "pf", "pa"]

# Away games
away = games[["away_team", "away_score", "home_score"]].copy()
away.columns = ["team", "pf", "pa"]

all_games = pd.concat([home, away])
all_games["win"] = all_games["pf"] > all_games["pa"]

# Aggregate
team_scoring = (all_games.groupby("team")
                .agg(
                    games=("win", "count"),
                    actual_wins=("win", "sum"),
                    points_for=("pf", "sum"),
                    points_against=("pa", "sum")
                )
                .reset_index())

# Pythagorean expectation
exp = 2.37
team_scoring["pyth_exp"] = (team_scoring["points_for"]**exp /
                            (team_scoring["points_for"]**exp + team_scoring["points_against"]**exp))
team_scoring["expected_wins"] = team_scoring["pyth_exp"] * team_scoring["games"]
team_scoring["luck"] = team_scoring["actual_wins"] - team_scoring["expected_wins"]
team_scoring = team_scoring.sort_values("luck", ascending=False)

print("Pythagorean Win Expectation:")
print(team_scoring[["team", "actual_wins", "expected_wins", "luck"]].round(1))
nflfastR
tidyverse
nfl_data_py
pandas
numpy
library(nflfastR)
library(tidyverse)

schedules <- load_schedules(2023)
teams <- load_teams()

# Add division info
games <- schedules %>%
  filter(!is.na(result)) %>%
  left_join(teams %>% select(team_abbr, team_division),
            by = c("home_team" = "team_abbr")) %>%
  rename(home_div = team_division) %>%
  left_join(teams %>% select(team_abbr, team_division),
            by = c("away_team" = "team_abbr")) %>%
  rename(away_div = team_division) %>%
  mutate(division_game = home_div == away_div)

# Create team-game records
home <- games %>%
  select(team = home_team, opponent = away_team, pf = home_score,
         pa = away_score, division_game)
away <- games %>%
  select(team = away_team, opponent = home_team, pf = away_score,
         pa = home_score, division_game)

all_games <- bind_rows(home, away) %>%
  mutate(
    win = pf > pa,
    margin = pf - pa
  )

# Division vs Non-Division performance
div_perf <- all_games %>%
  group_by(team, division_game) %>%
  summarize(
    games = n(),
    wins = sum(win),
    win_pct = mean(win) * 100,
    avg_margin = mean(margin),
    .groups = "drop"
  ) %>%
  pivot_wider(
    names_from = division_game,
    values_from = c(games, wins, win_pct, avg_margin),
    names_prefix = "div_"
  )

print("Division vs Non-Division Performance:")
print(div_perf %>%
        select(team, win_pct_div_TRUE, win_pct_div_FALSE) %>%
        mutate(div_advantage = win_pct_div_TRUE - win_pct_div_FALSE) %>%
        arrange(desc(div_advantage)))
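To turn that gap into a picture, a minimal ggplot2 sketch (assuming the div_perf data frame built above) plots each team's division-game advantage in win percentage:

library(ggplot2)

div_perf %>%
  mutate(div_advantage = win_pct_div_TRUE - win_pct_div_FALSE) %>%
  ggplot(aes(x = reorder(team, div_advantage), y = div_advantage)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Division vs Non-Division Win % Advantage (2023)",
    x = NULL, y = "Win % in division games minus non-division games"
  )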
import nfl_data_py as nfl
import pandas as pd

schedules = nfl.import_schedules([2023])
teams = nfl.import_team_desc()

# Add division info
games = schedules[schedules["result"].notna()].copy()
games = games.merge(teams[["team_abbr", "team_division"]],
                    left_on="home_team", right_on="team_abbr", how="left")
games = games.rename(columns={"team_division": "home_div"}).drop("team_abbr", axis=1)
games = games.merge(teams[["team_abbr", "team_division"]],
                    left_on="away_team", right_on="team_abbr", how="left")
games = games.rename(columns={"team_division": "away_div"}).drop("team_abbr", axis=1)
games["division_game"] = games["home_div"] == games["away_div"]

# Create team-game records
home = games[["home_team", "away_team", "home_score", "away_score", "division_game"]].copy()
home.columns = ["team", "opponent", "pf", "pa", "division_game"]
away = games[["away_team", "home_team", "away_score", "home_score", "division_game"]].copy()
away.columns = ["team", "opponent", "pf", "pa", "division_game"]

all_games = pd.concat([home, away])
all_games["win"] = all_games["pf"] > all_games["pa"]
all_games["margin"] = all_games["pf"] - all_games["pa"]

# Division vs Non-Division
div_perf = (all_games.groupby(["team", "division_game"])
            .agg(games=("win", "count"), wins=("win", "sum"),
                 win_pct=("win", lambda x: x.mean() * 100), avg_margin=("margin", "mean"))
            .reset_index())

div_games = div_perf[div_perf["division_game"]].set_index("team")["win_pct"]
non_div = div_perf[~div_perf["division_game"]].set_index("team")["win_pct"]

comparison = pd.DataFrame({"Division": div_games, "Non-Division": non_div})
comparison["Advantage"] = comparison["Division"] - comparison["Non-Division"]
comparison = comparison.sort_values("Advantage", ascending=False).reset_index()

print("Division vs Non-Division Performance:")
print(comparison)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

# Load multiple years for playoff sample
pbp <- load_pbp(2020:2023)

# Separate regular season and playoffs (season_type avoids week-numbering
# differences between the 17- and 18-game schedules)
perf_comparison <- pbp %>%
  filter(play_type %in% c("pass", "run"), !is.na(epa)) %>%
  mutate(game_type = if_else(season_type == "REG", "Regular Season", "Playoffs")) %>%
  group_by(posteam, game_type) %>%
  summarize(
    plays = n(),
    epa_per_play = mean(epa),
    success_rate = mean(success, na.rm = TRUE) * 100,
    pass_rate = mean(play_type == "pass") * 100,
    .groups = "drop"
  ) %>%
  filter(plays >= 100)  # Minimum plays threshold

# Compare teams with playoff experience
playoff_teams <- perf_comparison %>%
  filter(game_type == "Playoffs") %>%
  pull(posteam) %>%
  unique()

comparison <- perf_comparison %>%
  filter(posteam %in% playoff_teams) %>%
  select(posteam, game_type, epa_per_play) %>%
  pivot_wider(names_from = game_type, values_from = epa_per_play) %>%
  mutate(
    playoff_bump = Playoffs - `Regular Season`
  ) %>%
  arrange(desc(playoff_bump))

print("Playoff vs Regular Season EPA (2020-2023):")
print(comparison)
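A minimal ggplot2 sketch (assuming the comparison data frame built above) puts regular-season and playoff EPA on the same axes; teams above the dashed line raised their level in the postseason.

library(ggplot2)

ggplot(comparison, aes(x = `Regular Season`, y = Playoffs)) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  geom_point() +
  geom_text(aes(label = posteam), vjust = -0.7, size = 3) +
  labs(
    title = "Playoff vs Regular Season EPA per Play (2020-2023)",
    x = "Regular season EPA per play", y = "Playoff EPA per play"
  )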
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2020, 2021, 2022, 2023])

# Filter and categorize (season_type avoids week-numbering differences
# between the 17- and 18-game schedules)
plays = pbp[(pbp["play_type"].isin(["pass", "run"])) & (pbp["epa"].notna())].copy()
plays["game_type"] = plays["season_type"].apply(
    lambda s: "Regular Season" if s == "REG" else "Playoffs"
)

# Performance by game type
perf = (plays.groupby(["posteam", "game_type"])
        .agg(
            plays=("epa", "count"),
            epa_per_play=("epa", "mean"),
            success_rate=("success", lambda x: x.mean() * 100),
            pass_rate=("play_type", lambda x: (x == "pass").mean() * 100)
        )
        .reset_index())
perf = perf[perf["plays"] >= 100]

# Teams with playoff experience
playoff_teams = perf[perf["game_type"] == "Playoffs"]["posteam"].unique()
comparison = perf[perf["posteam"].isin(playoff_teams)].copy()

# Pivot
reg = comparison[comparison["game_type"] == "Regular Season"].set_index("posteam")["epa_per_play"]
playoff = comparison[comparison["game_type"] == "Playoffs"].set_index("posteam")["epa_per_play"]

result = pd.DataFrame({"Regular Season": reg, "Playoffs": playoff})
result["Playoff Bump"] = result["Playoffs"] - result["Regular Season"]
result = result.sort_values("Playoff Bump", ascending=False).reset_index()

print("Playoff vs Regular Season EPA (2020-2023):")
print(result)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

schedules <- load_schedules(2023)

# Calculate team records
games <- schedules %>%
  filter(!is.na(result))

home <- games %>%
  select(team = home_team, pf = home_score, pa = away_score) %>%
  mutate(win = pf > pa)
away <- games %>%
  select(team = away_team, pf = away_score, pa = home_score) %>%
  mutate(win = pf > pa)

records <- bind_rows(home, away) %>%
  group_by(team) %>%
  summarize(
    wins = sum(win),
    losses = sum(!win),
    win_pct = mean(win),
    .groups = "drop"
  )

# Calculate SOS (average win pct of opponents)
calculate_sos <- function(team_name) {
  opponents <- c(
    games$away_team[games$home_team == team_name],
    games$home_team[games$away_team == team_name]
  )
  opp_records <- records %>%
    filter(team %in% opponents)
  mean(opp_records$win_pct)
}

sos <- records %>%
  rowwise() %>%
  mutate(sos = calculate_sos(team)) %>%
  ungroup() %>%
  mutate(sos_rank = rank(-sos))

print("Strength of Schedule Rankings:")
print(sos %>%
        select(team, wins, losses, win_pct, sos) %>%
        arrange(desc(sos)) %>%
        mutate(
          sos = round(sos, 3),
          rank = row_number()
        ))
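A natural follow-up is whether the toughest schedules belonged to the weakest records. This minimal sketch (assuming the sos data frame built above) checks the correlation and plots it:

library(ggplot2)

# A negative correlation would suggest tougher schedules came with worse records
cor(sos$sos, sos$win_pct)

ggplot(sos, aes(x = sos, y = win_pct)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_text(aes(label = team), vjust = -0.7, size = 3) +
  labs(
    title = "Strength of Schedule vs Win Percentage (2023)",
    x = "Opponent average win %", y = "Win %"
  )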
import nfl_data_py as nfl
import pandas as pd

schedules = nfl.import_schedules([2023])
games = schedules[schedules["result"].notna()].copy()

# Calculate team records
home = games[["home_team", "home_score", "away_score"]].copy()
home.columns = ["team", "pf", "pa"]
home["win"] = home["pf"] > home["pa"]

away = games[["away_team", "away_score", "home_score"]].copy()
away.columns = ["team", "pf", "pa"]
away["win"] = away["pf"] > away["pa"]

all_games = pd.concat([home, away])
records = (all_games.groupby("team")
           .agg(wins=("win", "sum"), losses=("win", lambda x: (~x).sum()),
                win_pct=("win", "mean"))
           .reset_index())

# Calculate SOS (average win pct of opponents)
def calc_sos(team):
    home_opps = games[games["home_team"] == team]["away_team"].tolist()
    away_opps = games[games["away_team"] == team]["home_team"].tolist()
    opponents = home_opps + away_opps
    opp_records = records[records["team"].isin(opponents)]
    return opp_records["win_pct"].mean()

records["sos"] = records["team"].apply(calc_sos)
records = records.sort_values("sos", ascending=False)
records["rank"] = range(1, len(records) + 1)

print("Strength of Schedule Rankings:")
print(records[["rank", "team", "wins", "losses", "sos"]].round(3))
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

# Load two seasons
pbp_prev <- load_pbp(2022)
pbp_curr <- load_pbp(2023)

# Calculate EPA per play by team for each year
calc_team_epa <- function(pbp, year) {
  pbp %>%
    filter(play_type %in% c("pass", "run"), !is.na(epa)) %>%
    group_by(team = posteam) %>%
    summarize(
      plays = n(),
      epa_per_play = mean(epa),
      success_rate = mean(success, na.rm = TRUE) * 100,
      pass_rate = mean(play_type == "pass") * 100,
      .groups = "drop"
    ) %>%
    mutate(season = year)
}

epa_2022 <- calc_team_epa(pbp_prev, 2022)
epa_2023 <- calc_team_epa(pbp_curr, 2023)

# Compare
improvement <- epa_2022 %>%
  select(team, epa_2022 = epa_per_play, sr_2022 = success_rate) %>%
  inner_join(
    epa_2023 %>% select(team, epa_2023 = epa_per_play, sr_2023 = success_rate),
    by = "team"
  ) %>%
  mutate(
    epa_change = epa_2023 - epa_2022,
    sr_change = sr_2023 - sr_2022
  ) %>%
  arrange(desc(epa_change))

print("Year-over-Year EPA Change (2022 to 2023):")
print(improvement %>%
        mutate(across(where(is.numeric), ~round(., 3))))
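Year-over-year EPA tends to regress toward the mean, so last season only partially predicts the next one. A minimal sketch (assuming the improvement data frame built above) quantifies that with a simple linear model:

# A slope well below 1 indicates regression toward the league average
fit <- lm(epa_2023 ~ epa_2022, data = improvement)
summary(fit)$coefficients
summary(fit)$r.squared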
import nfl_data_py as nfl
import pandas as pd

# Load two seasons
pbp_2022 = nfl.import_pbp_data([2022])
pbp_2023 = nfl.import_pbp_data([2023])

# Calculate EPA per play by team for each year
def calc_team_epa(pbp, year):
    plays = pbp[(pbp["play_type"].isin(["pass", "run"])) & (pbp["epa"].notna())]
    return (plays.groupby("posteam")
            .agg(
                plays=("epa", "count"),
                epa_per_play=("epa", "mean"),
                success_rate=("success", lambda x: x.mean() * 100),
                pass_rate=("play_type", lambda x: (x == "pass").mean() * 100)
            )
            .reset_index()
            .rename(columns={"posteam": "team"})
            .assign(season=year))

epa_2022 = calc_team_epa(pbp_2022, 2022)
epa_2023 = calc_team_epa(pbp_2023, 2023)

# Merge and compare
comparison = epa_2022[["team", "epa_per_play", "success_rate"]].merge(
    epa_2023[["team", "epa_per_play", "success_rate"]],
    on="team",
    suffixes=("_2022", "_2023")
)
comparison["epa_change"] = comparison["epa_per_play_2023"] - comparison["epa_per_play_2022"]
comparison["sr_change"] = comparison["success_rate_2023"] - comparison["success_rate_2022"]
comparison = comparison.sort_values("epa_change", ascending=False)

print("Year-over-Year EPA Change (2022 to 2023):")
print(comparison.round(3))
nflfastR
tidyverse
nfl_data_py
pandas
nflfastR - Play-by-play data with EPA
nflplotR - NFL team logos & plotting
tidyverse - Data manipulation & visualization
ggplot2 - Advanced visualizations
nfl_data_py - NFL data (nflverse compatible)
pandas - Data manipulation
matplotlib - Visualizations
scikit-learn - Machine learning
Learn the theory behind these techniques in our comprehensive tutorial series.
Browse Tutorials