Code Examples

Copy-paste ready R and Python code for NFL analytics. From data loading to machine learning models.

122 Examples
R & Python Support: All examples include both R and Python versions. Click the tabs to switch between languages. Use the copy button to copy code to clipboard.

EPA Analysis

Calculate and analyze Expected Points Added for teams, players, and situations

Calculate Team EPA per Play
Compute offensive and defensive EPA per play rankings for all teams.
Intermediate
library(nflfastR)
library(tidyverse)

# Load data
pbp <- load_pbp(2023)

# Filter to pass and rush plays
plays <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run"))

# Calculate offensive EPA/play
offense_epa <- plays %>%
  group_by(posteam) %>%
  summarize(
    plays = n(),
    total_epa = sum(epa),
    epa_per_play = mean(epa),
    pass_epa = mean(epa[play_type == "pass"]),
    rush_epa = mean(epa[play_type == "run"]),
    success_rate = mean(success)
  ) %>%
  arrange(desc(epa_per_play))

# Calculate defensive EPA/play (lower is better)
defense_epa <- plays %>%
  group_by(defteam) %>%
  summarize(
    plays = n(),
    epa_allowed = mean(epa),
    pass_epa_allowed = mean(epa[play_type == "pass"]),
    rush_epa_allowed = mean(epa[play_type == "run"])
  ) %>%
  arrange(epa_allowed)

print(offense_epa)
print(defense_epa)
import nfl_data_py as nfl
import pandas as pd

# Load data
pbp = nfl.import_pbp_data([2023])

# Filter to pass and rush plays
plays = pbp[(pbp["epa"].notna()) &
            (pbp["play_type"].isin(["pass", "run"]))]

# Calculate offensive EPA/play
offense_epa = (plays.groupby("posteam")
    .agg(
        plays=("epa", "count"),
        total_epa=("epa", "sum"),
        epa_per_play=("epa", "mean"),
        success_rate=("success", "mean")
    )
    .sort_values("epa_per_play", ascending=False)
    .reset_index())

# Calculate defensive EPA/play
defense_epa = (plays.groupby("defteam")
    .agg(epa_allowed=("epa", "mean"))
    .sort_values("epa_allowed")
    .reset_index())

print("Offensive EPA/Play:")
print(offense_epa.head(10))
print("\nDefensive EPA/Play:")
print(defense_epa.head(10))
Packages: nflfastR tidyverse nfl_data_py pandas
Quarterback EPA Rankings
Rank quarterbacks by EPA per dropback with minimum attempt filters.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# QB EPA analysis
qb_epa <- pbp %>%
  filter(!is.na(epa), !is.na(passer_player_id)) %>%
  group_by(passer_player_id, passer_player_name) %>%
  summarize(
    dropbacks = n(),
    total_epa = sum(epa),
    epa_per_dropback = mean(epa),
    cpoe = mean(cpoe, na.rm = TRUE),
    success_rate = mean(success),
    avg_air_yards = mean(air_yards, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(dropbacks >= 200) %>%
  arrange(desc(epa_per_dropback))

# Add rank
qb_epa <- qb_epa %>%
  mutate(rank = row_number())

print(qb_epa, n = 20)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Filter to pass plays with QB identified
pass_plays = pbp[(pbp["epa"].notna()) &
                 (pbp["passer_player_id"].notna())]

# QB EPA analysis
qb_epa = (pass_plays.groupby(["passer_player_id", "passer_player_name"])
    .agg(
        dropbacks=("epa", "count"),
        total_epa=("epa", "sum"),
        epa_per_dropback=("epa", "mean"),
        cpoe=("cpoe", "mean"),
        success_rate=("success", "mean"),
        avg_air_yards=("air_yards", "mean")
    )
    .reset_index())

# Filter min dropbacks and rank
qb_epa = (qb_epa[qb_epa["dropbacks"] >= 200]
    .sort_values("epa_per_dropback", ascending=False)
    .reset_index(drop=True))
qb_epa["rank"] = range(1, len(qb_epa) + 1)

print(qb_epa.head(20))
Packages: nflfastR tidyverse nfl_data_py pandas
EPA by Down and Distance
Analyze expected points by game situation (down, distance, field position).
Advanced
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# EPA by down
epa_by_down <- pbp %>%
  filter(!is.na(epa), down <= 4) %>%
  group_by(down) %>%
  summarize(
    plays = n(),
    avg_epa = mean(epa),
    pass_rate = mean(play_type == "pass", na.rm = TRUE),
    success_rate = mean(success)
  )

print(epa_by_down)

# EPA by distance buckets
epa_by_distance <- pbp %>%
  filter(!is.na(epa), down == 1) %>%
  mutate(
    distance_bucket = case_when(
      ydstogo <= 3 ~ "1-3 yards",
      ydstogo <= 7 ~ "4-7 yards",
      ydstogo <= 10 ~ "8-10 yards",
      TRUE ~ "11+ yards"
    )
  ) %>%
  group_by(distance_bucket) %>%
  summarize(
    plays = n(),
    avg_epa = mean(epa),
    success_rate = mean(success)
  )

# EPA by field position
epa_by_field <- pbp %>%
  filter(!is.na(epa)) %>%
  mutate(
    field_zone = case_when(
      yardline_100 >= 80 ~ "Own 1-20",
      yardline_100 >= 60 ~ "Own 21-40",
      yardline_100 >= 40 ~ "Midfield",
      yardline_100 >= 20 ~ "Opp 21-40",
      TRUE ~ "Red Zone"
    )
  ) %>%
  group_by(field_zone) %>%
  summarize(
    plays = n(),
    avg_epa = mean(epa),
    td_rate = mean(touchdown, na.rm = TRUE)
  )
import nfl_data_py as nfl
import pandas as pd
import numpy as np

pbp = nfl.import_pbp_data([2023])

# EPA by down
plays = pbp[(pbp["epa"].notna()) & (pbp["down"] <= 4)]
epa_by_down = (plays.groupby("down")
    .agg(
        plays=("epa", "count"),
        avg_epa=("epa", "mean"),
        success_rate=("success", "mean")
    )
    .reset_index())
print("EPA by Down:")
print(epa_by_down)

# EPA by field position
def get_field_zone(yard):
    if yard >= 80: return "Own 1-20"
    elif yard >= 60: return "Own 21-40"
    elif yard >= 40: return "Midfield"
    elif yard >= 20: return "Opp 21-40"
    else: return "Red Zone"

plays["field_zone"] = plays["yardline_100"].apply(get_field_zone)
epa_by_field = (plays.groupby("field_zone")
    .agg(
        plays=("epa", "count"),
        avg_epa=("epa", "mean"),
        td_rate=("touchdown", "mean")
    )
    .reset_index())
print("\nEPA by Field Position:")
print(epa_by_field)
Packages: nflfastR tidyverse nfl_data_py pandas numpy
Rolling EPA Trends
Calculate rolling EPA averages to track team performance over time.
Advanced
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Game-level EPA
game_epa <- pbp %>%
  filter(!is.na(epa)) %>%
  group_by(game_id, posteam, week) %>%
  summarize(
    epa_per_play = mean(epa),
    .groups = "drop"
  )

# Calculate 4-game rolling average
rolling_epa <- game_epa %>%
  arrange(posteam, week) %>%
  group_by(posteam) %>%
  mutate(
    rolling_4_epa = zoo::rollmean(epa_per_play, k = 4,
                                   fill = NA, align = "right")
  ) %>%
  ungroup()

# Find teams that improved most
improvement <- rolling_epa %>%
  filter(week %in% c(5, 17)) %>%  # After 4 games vs end
  select(posteam, week, rolling_4_epa) %>%
  pivot_wider(names_from = week, values_from = rolling_4_epa,
              names_prefix = "week_") %>%
  mutate(improvement = week_17 - week_5) %>%
  arrange(desc(improvement))

print(improvement)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Game-level EPA
game_epa = (pbp[pbp["epa"].notna()]
    .groupby(["game_id", "posteam", "week"])
    .agg(epa_per_play=("epa", "mean"))
    .reset_index())

# Calculate 4-game rolling average
game_epa = game_epa.sort_values(["posteam", "week"])
game_epa["rolling_4_epa"] = (game_epa.groupby("posteam")
    ["epa_per_play"].transform(lambda x: x.rolling(4).mean()))

# Find teams that improved most
early = game_epa[game_epa["week"] == 5][["posteam", "rolling_4_epa"]]
late = game_epa[game_epa["week"] == 17][["posteam", "rolling_4_epa"]]

improvement = early.merge(late, on="posteam", suffixes=("_early", "_late"))
improvement["change"] = improvement["rolling_4_epa_late"] - improvement["rolling_4_epa_early"]
improvement = improvement.sort_values("change", ascending=False)

print(improvement)
Packages: nflfastR tidyverse zoo nfl_data_py pandas
EPA by Play Type and Quarter
Analyze how EPA varies by play type across different game quarters.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# EPA by quarter and play type
epa_by_qtr <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run"), qtr <= 4) %>%
  group_by(qtr, play_type) %>%
  summarize(
    plays = n(),
    avg_epa = mean(epa),
    success_rate = mean(success),
    .groups = "drop"
  )

print(epa_by_qtr)

# Pass vs Run EPA advantage by quarter
epa_by_qtr %>%
  pivot_wider(names_from = play_type, values_from = c(avg_epa, success_rate)) %>%
  mutate(pass_advantage = avg_epa_pass - avg_epa_run)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Filter valid plays
plays = pbp[(pbp["epa"].notna()) &
            (pbp["play_type"].isin(["pass", "run"])) &
            (pbp["qtr"] <= 4)]

# EPA by quarter and play type
epa_by_qtr = (plays.groupby(["qtr", "play_type"])
    .agg(
        plays=("epa", "count"),
        avg_epa=("epa", "mean"),
        success_rate=("success", "mean")
    )
    .reset_index())

print("EPA by Quarter and Play Type:")
print(epa_by_qtr)

# Pass vs Run comparison
pivot = epa_by_qtr.pivot(index="qtr", columns="play_type", values="avg_epa")
pivot["pass_advantage"] = pivot["pass"] - pivot["run"]
print("\nPass EPA Advantage by Quarter:")
print(pivot)
Packages: nflfastR tidyverse nfl_data_py pandas
Red Zone EPA Analysis
Evaluate team and player efficiency inside the opponent's 20-yard line.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Red zone analysis (inside opponent 20)
red_zone <- pbp %>%
  filter(!is.na(epa), yardline_100 <= 20, play_type %in% c("pass", "run"))

# Team red zone efficiency
team_rz <- red_zone %>%
  group_by(posteam) %>%
  summarize(
    rz_plays = n(),
    rz_epa = mean(epa),
    rz_td_rate = mean(touchdown, na.rm = TRUE),
    rz_success_rate = mean(success),
    pass_rate = mean(play_type == "pass")
  ) %>%
  arrange(desc(rz_td_rate))

print(team_rz)

# Red zone vs rest of field comparison
rbind(
  red_zone %>% summarize(zone = "Red Zone", epa = mean(epa), success = mean(success)),
  pbp %>% filter(yardline_100 > 20, !is.na(epa)) %>%
    summarize(zone = "Outside RZ", epa = mean(epa), success = mean(success))
)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Red zone plays (inside opponent 20)
red_zone = pbp[(pbp["epa"].notna()) &
               (pbp["yardline_100"] <= 20) &
               (pbp["play_type"].isin(["pass", "run"]))]

# Team red zone efficiency
team_rz = (red_zone.groupby("posteam")
    .agg(
        rz_plays=("epa", "count"),
        rz_epa=("epa", "mean"),
        rz_td_rate=("touchdown", "mean"),
        rz_success_rate=("success", "mean")
    )
    .reset_index()
    .sort_values("rz_td_rate", ascending=False))

print("Team Red Zone Efficiency:")
print(team_rz)

# Red zone vs outside comparison
outside_rz = pbp[(pbp["epa"].notna()) & (pbp["yardline_100"] > 20)]
print(f"\nRed Zone EPA: {red_zone['epa'].mean():.3f}")
print(f"Outside RZ EPA: {outside_rz['epa'].mean():.3f}")
Packages: nflfastR tidyverse nfl_data_py pandas
Third Down EPA Breakdown
Analyze third down conversion efficiency using EPA metrics.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Third down analysis
third_down <- pbp %>%
  filter(!is.na(epa), down == 3)

# By distance bucket
third_by_dist <- third_down %>%
  mutate(
    dist_bucket = case_when(
      ydstogo <= 3 ~ "Short (1-3)",
      ydstogo <= 6 ~ "Medium (4-6)",
      ydstogo <= 10 ~ "Long (7-10)",
      TRUE ~ "Very Long (11+)"
    )
  ) %>%
  group_by(dist_bucket) %>%
  summarize(
    attempts = n(),
    conversion_rate = mean(first_down, na.rm = TRUE),
    avg_epa = mean(epa),
    pass_rate = mean(play_type == "pass", na.rm = TRUE)
  )

print(third_by_dist)

# Team third down rankings
team_3rd <- third_down %>%
  group_by(posteam) %>%
  summarize(
    attempts = n(),
    conversions = sum(first_down, na.rm = TRUE),
    conv_rate = mean(first_down, na.rm = TRUE),
    epa_per_play = mean(epa)
  ) %>%
  arrange(desc(conv_rate))

print(team_3rd)
import nfl_data_py as nfl
import pandas as pd
import numpy as np

pbp = nfl.import_pbp_data([2023])

# Third down plays
third_down = pbp[(pbp["epa"].notna()) & (pbp["down"] == 3)]

# Distance buckets
def dist_bucket(ydstogo):
    if ydstogo <= 3: return "Short (1-3)"
    elif ydstogo <= 6: return "Medium (4-6)"
    elif ydstogo <= 10: return "Long (7-10)"
    else: return "Very Long (11+)"

third_down["dist_bucket"] = third_down["ydstogo"].apply(dist_bucket)

# Analysis by distance
third_by_dist = (third_down.groupby("dist_bucket")
    .agg(
        attempts=("epa", "count"),
        conversion_rate=("first_down", "mean"),
        avg_epa=("epa", "mean")
    )
    .reset_index())

print("Third Down by Distance:")
print(third_by_dist)

# Team rankings
team_3rd = (third_down.groupby("posteam")
    .agg(
        attempts=("epa", "count"),
        conv_rate=("first_down", "mean"),
        epa_per_play=("epa", "mean")
    )
    .reset_index()
    .sort_values("conv_rate", ascending=False))

print("\nTeam Third Down Rankings:")
print(team_3rd.head(10))
Packages: nflfastR tidyverse nfl_data_py pandas numpy
Garbage Time Filtering
Filter out garbage time plays to get more meaningful EPA analysis.
Advanced
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Define garbage time (various definitions)
# Common: Win prob < 10% or > 90% in 4th quarter
pbp_filtered <- pbp %>%
  filter(!is.na(epa)) %>%
  mutate(
    garbage_time = case_when(
      qtr == 4 & (wp < 0.10 | wp > 0.90) ~ TRUE,
      qtr == 4 & abs(score_differential) > 21 ~ TRUE,
      TRUE ~ FALSE
    )
  )

# Compare EPA with and without garbage time
comparison <- pbp_filtered %>%
  filter(play_type %in% c("pass", "run")) %>%
  group_by(posteam, garbage_time) %>%
  summarize(
    plays = n(),
    epa_per_play = mean(epa),
    .groups = "drop"
  ) %>%
  pivot_wider(names_from = garbage_time, values_from = c(plays, epa_per_play))

# Teams with biggest garbage time inflation
comparison %>%
  mutate(epa_diff = epa_per_play_TRUE - epa_per_play_FALSE) %>%
  arrange(desc(epa_diff)) %>%
  select(posteam, epa_per_play_FALSE, epa_per_play_TRUE, epa_diff)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Filter valid plays
plays = pbp[pbp["epa"].notna()].copy()

# Define garbage time
plays["garbage_time"] = (
    ((plays["qtr"] == 4) & ((plays["wp"] < 0.10) | (plays["wp"] > 0.90))) |
    ((plays["qtr"] == 4) & (plays["score_differential"].abs() > 21))
)

# Compare with and without garbage time
plays_filtered = plays[plays["play_type"].isin(["pass", "run"])]

# EPA without garbage time
clean_epa = (plays_filtered[~plays_filtered["garbage_time"]]
    .groupby("posteam")["epa"].mean())

# EPA with garbage time
all_epa = plays_filtered.groupby("posteam")["epa"].mean()

# Compare
comparison = pd.DataFrame({
    "clean_epa": clean_epa,
    "all_epa": all_epa
})
comparison["garbage_inflation"] = comparison["all_epa"] - comparison["clean_epa"]
comparison = comparison.sort_values("garbage_inflation", ascending=False)

print("Garbage Time EPA Inflation:")
print(comparison.head(10))
Packages: nflfastR tidyverse nfl_data_py pandas
EPA Stability Year-over-Year
Analyze how stable EPA metrics are from season to season.
Advanced
library(nflfastR)
library(tidyverse)

# Load two consecutive seasons
pbp_2022 <- load_pbp(2022)
pbp_2023 <- load_pbp(2023)

# Calculate team EPA for each season
get_team_epa <- function(pbp, season) {
  pbp %>%
    filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
    group_by(posteam) %>%
    summarize(
      epa_per_play = mean(epa),
      pass_epa = mean(epa[play_type == "pass"]),
      rush_epa = mean(epa[play_type == "run"]),
      .groups = "drop"
    ) %>%
    mutate(season = season)
}

epa_2022 <- get_team_epa(pbp_2022, 2022)
epa_2023 <- get_team_epa(pbp_2023, 2023)

# Join and calculate correlation
combined <- epa_2022 %>%
  inner_join(epa_2023, by = "posteam", suffix = c("_2022", "_2023"))

# Correlations
cat("Overall EPA correlation:", cor(combined$epa_per_play_2022, combined$epa_per_play_2023), "\n")
cat("Pass EPA correlation:", cor(combined$pass_epa_2022, combined$pass_epa_2023), "\n")
cat("Rush EPA correlation:", cor(combined$rush_epa_2022, combined$rush_epa_2023), "\n")
import nfl_data_py as nfl
import pandas as pd

# Load two consecutive seasons
pbp_2022 = nfl.import_pbp_data([2022])
pbp_2023 = nfl.import_pbp_data([2023])

def get_team_epa(pbp):
    plays = pbp[(pbp["epa"].notna()) &
                (pbp["play_type"].isin(["pass", "run"]))]
    return plays.groupby("posteam")["epa"].mean()

epa_2022 = get_team_epa(pbp_2022)
epa_2023 = get_team_epa(pbp_2023)

# Combine
combined = pd.DataFrame({
    "epa_2022": epa_2022,
    "epa_2023": epa_2023
}).dropna()

# Calculate correlation
correlation = combined["epa_2022"].corr(combined["epa_2023"])
print(f"Year-over-Year EPA Correlation: {correlation:.3f}")

# Show biggest changes
combined["change"] = combined["epa_2023"] - combined["epa_2022"]
print("\nBiggest Improvements:")
print(combined.nlargest(5, "change"))
print("\nBiggest Declines:")
print(combined.nsmallest(5, "change"))
Packages: nflfastR tidyverse nfl_data_py pandas
Explosive Play EPA Analysis
Analyze the impact of explosive plays (20+ yard gains) on team EPA.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Define explosive plays (20+ yards)
explosive <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
  mutate(
    explosive = yards_gained >= 20,
    big_play = yards_gained >= 40
  )

# Team explosive play rates
team_explosive <- explosive %>%
  group_by(posteam) %>%
  summarize(
    total_plays = n(),
    explosive_plays = sum(explosive),
    explosive_rate = mean(explosive) * 100,
    explosive_epa = sum(epa[explosive]),
    non_explosive_epa = sum(epa[!explosive]),
    epa_from_explosive_pct = explosive_epa / sum(epa) * 100
  ) %>%
  arrange(desc(explosive_rate))

print(team_explosive)

# Explosive vs non-explosive EPA comparison
cat("\nAverage EPA on explosive plays:",
    mean(explosive$epa[explosive$explosive]), "\n")
cat("Average EPA on non-explosive plays:",
    mean(explosive$epa[!explosive$explosive]), "\n")
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Filter valid plays
plays = pbp[(pbp["epa"].notna()) &
            (pbp["play_type"].isin(["pass", "run"]))].copy()

# Define explosive plays
plays["explosive"] = plays["yards_gained"] >= 20

# Team explosive play analysis
team_explosive = (plays.groupby("posteam")
    .agg(
        total_plays=("epa", "count"),
        explosive_plays=("explosive", "sum"),
        total_epa=("epa", "sum")
    )
    .reset_index())

team_explosive["explosive_rate"] = (team_explosive["explosive_plays"] /
                                     team_explosive["total_plays"] * 100)
team_explosive = team_explosive.sort_values("explosive_rate", ascending=False)

print("Team Explosive Play Rates:")
print(team_explosive)

# EPA comparison
explosive_epa = plays[plays["explosive"]]["epa"].mean()
non_explosive_epa = plays[~plays["explosive"]]["epa"].mean()
print(f"\nAverage EPA on explosive plays: {explosive_epa:.3f}")
print(f"Average EPA on non-explosive plays: {non_explosive_epa:.3f}")
Packages: nflfastR tidyverse nfl_data_py pandas
Quick Package Reference
R Packages
  • nflfastR - Play-by-play data with EPA
  • nflplotR - NFL team logos & plotting
  • tidyverse - Data manipulation & visualization
  • ggplot2 - Advanced visualizations
Python Packages
  • nfl_data_py - NFL data (nflverse compatible)
  • pandas - Data manipulation
  • matplotlib - Visualizations
  • scikit-learn - Machine learning

Ready to Dive Deeper?

Learn the theory behind these techniques in our comprehensive tutorial series

Browse Tutorials