Code Examples

Copy-paste ready R and Python code for NFL analytics. From data loading to machine learning models.

122 Examples
R & Python Support: All examples include both R and Python versions. Click the tabs to switch between languages. Use the copy button to copy code to clipboard.

Data Loading

Load NFL play-by-play data using nflfastR, nfl_data_py, and various APIs

Load Play-by-Play Data (nflfastR)
Load NFL play-by-play data for one or multiple seasons using the nflfastR package.
Beginner
# Install and load nflfastR
# install.packages("nflfastR")
library(nflfastR)
library(tidyverse)

# Pull a single season of play-by-play
pbp_2023 <- load_pbp(2023)

# Pull several seasons in one call
pbp_multi <- load_pbp(2021:2023)

# Inspect the columns and types
glimpse(pbp_2023)

# Quick size check
cat("Total plays:", nrow(pbp_2023), "\n")
cat("Columns:", ncol(pbp_2023), "\n")
# Install: pip install nfl_data_py
import nfl_data_py as nfl
import pandas as pd

# Pull a single season of play-by-play
pbp_2023 = nfl.import_pbp_data([2023])

# Pull several seasons in one call
pbp_multi = nfl.import_pbp_data(list(range(2021, 2024)))

# Inspect the columns and dtypes
print(pbp_2023.info())

# Quick size check
print(f"Total plays: {len(pbp_2023)}")
print(f"Columns: {len(pbp_2023.columns)}")
Packages: nflfastR tidyverse nfl_data_py pandas
Load Roster Data
Get player roster information including positions, teams, and biographical data.
Beginner
library(nflfastR)
library(tidyverse)

# Load current rosters
rosters <- fast_scraper_roster(2023)

# Load historical rosters
rosters_multi <- fast_scraper_roster(2020:2023)

# Filter to specific position
qbs <- rosters %>%
  filter(position == "QB") %>%
  select(full_name, team, age, years_exp, draft_number)

head(qbs)
import nfl_data_py as nfl
import pandas as pd

# Load current rosters
rosters = nfl.import_rosters([2023])

# Load historical rosters
rosters_multi = nfl.import_rosters([2020, 2021, 2022, 2023])

# Filter to specific position
qbs = rosters[rosters["position"] == "QB"][
    ["player_name", "team", "age", "years_exp", "draft_number"]
]

print(qbs.head())
Packages: nflfastR tidyverse nfl_data_py pandas
Load Next Gen Stats
Access NFL Next Gen Stats data for advanced player tracking metrics.
Intermediate
library(nflfastR)
library(tidyverse)

# Load Next Gen Stats - Passing
ngs_passing <- load_nextgen_stats(
  seasons = 2023,
  stat_type = "passing"
)

# Load Next Gen Stats - Rushing
ngs_rushing <- load_nextgen_stats(
  seasons = 2023,
  stat_type = "rushing"
)

# Load Next Gen Stats - Receiving
ngs_receiving <- load_nextgen_stats(
  seasons = 2023,
  stat_type = "receiving"
)

# View top passers by avg air yards
ngs_passing %>%
  filter(week == 0) %>%  # Season totals
  arrange(desc(avg_air_yards_to_sticks)) %>%
  select(player_display_name, team_abbr, avg_air_yards_to_sticks,
         completion_percentage_above_expectation) %>%
  head(10)
import nfl_data_py as nfl
import pandas as pd

# Load Next Gen Stats - Passing
ngs_passing = nfl.import_ngs_data(
    stat_type="passing",
    years=[2023]
)

# Load Next Gen Stats - Rushing
ngs_rushing = nfl.import_ngs_data(
    stat_type="rushing",
    years=[2023]
)

# Load Next Gen Stats - Receiving
ngs_receiving = nfl.import_ngs_data(
    stat_type="receiving",
    years=[2023]
)

# View top passers by avg air yards (season totals)
season_totals = ngs_passing[ngs_passing["week"] == 0]
top_passers = season_totals.nlargest(10, "avg_air_yards_to_sticks")[
    ["player_display_name", "team_abbr", "avg_air_yards_to_sticks",
     "completion_percentage_above_expectation"]
]
print(top_passers)
Packages: nflfastR tidyverse nfl_data_py pandas
Load Combine Data
Access NFL Combine results for draft prospect evaluation.
Beginner
library(nflfastR)
library(tidyverse)

# Load combine data
combine <- load_combine()

# Filter recent years
combine_recent <- combine %>%
  filter(draft_year >= 2020)

# Top 40-yard dash times by position
combine_recent %>%
  filter(!is.na(forty)) %>%
  group_by(pos) %>%
  slice_min(forty, n = 3) %>%
  select(draft_year, player_name, pos, forty, vertical, broad_jump) %>%
  arrange(pos, forty)
import nfl_data_py as nfl
import pandas as pd

# Load combine data
combine = nfl.import_combine_data()

# Filter recent years
combine_recent = combine[combine["draft_year"] >= 2020]

# Top 3 (fastest) 40-yard dash times per position.
# sort_values + groupby().head() replaces groupby().apply(nsmallest), which
# is deprecated in modern pandas for operating on the grouping column, and
# the final sort matches the R version's arrange(pos, forty).
top_forty = (combine_recent[combine_recent["forty"].notna()]
    .sort_values("forty")
    .groupby("pos")
    .head(3)
    .sort_values(["pos", "forty"])
    [["draft_year", "player_name", "pos", "forty", "vertical", "broad_jump"]]
    .reset_index(drop=True)
)
print(top_forty)
Packages: nflfastR tidyverse nfl_data_py pandas
Load Schedule and Game Results
Get NFL schedules, game results, and betting lines.
Beginner
library(nflfastR)
library(tidyverse)

# Load schedules with results (result = home_score - away_score)
schedules <- load_schedules(2020:2023)

# Filter to completed games
completed <- schedules %>%
  filter(!is.na(result))

# Calculate home win percentage
home_win_pct <- completed %>%
  summarize(
    home_wins = sum(result > 0),
    away_wins = sum(result < 0),
    ties = sum(result == 0),
    home_win_pct = home_wins / n()
  )

print(home_win_pct)

# Spread analysis
# In nflverse schedules, spread_line is the number of points the HOME team
# is favored by (positive = home favored). The home team covers when the
# actual margin beats the spread: result > spread_line. (The previous
# `result + spread_line > 0` had the sign flipped and counted almost every
# home favorite as covering.)
schedules %>%
  filter(!is.na(spread_line), !is.na(result)) %>%
  mutate(
    home_covered = result - spread_line > 0,
    away_covered = result - spread_line < 0
  ) %>%
  summarize(
    home_cover_rate = mean(home_covered, na.rm = TRUE),
    avg_spread = mean(spread_line),
    avg_margin = mean(abs(result))
  )
import nfl_data_py as nfl
import pandas as pd

# Load schedules with results (result = home_score - away_score)
schedules = nfl.import_schedules([2020, 2021, 2022, 2023])

# Filter to completed games
completed = schedules[schedules["result"].notna()]

# Calculate home win percentage
home_wins = (completed["result"] > 0).sum()
away_wins = (completed["result"] < 0).sum()
ties = (completed["result"] == 0).sum()
home_win_pct = home_wins / len(completed)

print(f"Home Win %: {home_win_pct:.1%}")

# Spread analysis
# .copy() so adding a column to this filtered slice doesn't raise
# pandas' SettingWithCopyWarning.
with_spread = completed[completed["spread_line"].notna()].copy()
# spread_line is the points the HOME team is favored by, so the home team
# covers when the actual margin beats the spread (result > spread_line);
# the previous `result + spread_line > 0` had the sign flipped.
with_spread["home_covered"] = with_spread["result"] - with_spread["spread_line"] > 0

print(f"Home Cover Rate: {with_spread['home_covered'].mean():.1%}")
print(f"Avg Spread: {with_spread['spread_line'].mean():.1f}")
print(f"Avg Margin: {with_spread['result'].abs().mean():.1f}")
Packages: nflfastR tidyverse nfl_data_py pandas
Load Player Participation Data
Get play-level player participation data showing which players were on the field.
Intermediate
library(nflfastR)
library(tidyverse)

# Load participation data
participation <- load_participation(2023)

# View structure
glimpse(participation)

# Analyze snap counts by player
snap_counts <- participation %>%
  separate_rows(offense_players, sep = ";") %>%
  filter(offense_players != "") %>%
  group_by(offense_players) %>%
  summarize(
    snaps = n(),
    games = n_distinct(game_id)
  ) %>%
  arrange(desc(snaps))

head(snap_counts, 20)
import nfl_data_py as nfl
import pandas as pd

# Load participation data
participation = nfl.import_participation([2023])

# View structure
print(participation.info())

# Count offensive snaps (simplified)
# Note: participation data structure varies
print(f"Total plays with participation: {len(participation)}")
print(participation.head())
Packages: nflfastR tidyverse nfl_data_py pandas
Load Injuries Data
Access weekly injury reports and player injury history.
Beginner
library(nflfastR)
library(tidyverse)

# Load injuries data
injuries <- load_injuries(2023)

# View structure
glimpse(injuries)

# Count injuries by team and status
injury_summary <- injuries %>%
  group_by(team, report_status) %>%
  summarize(players = n(), .groups = "drop") %>%
  pivot_wider(names_from = report_status, values_from = players, values_fill = 0)

print(injury_summary)

# Find most common injury types
injuries %>%
  filter(!is.na(report_primary_injury)) %>%
  count(report_primary_injury, sort = TRUE) %>%
  head(10)
import nfl_data_py as nfl
import pandas as pd

# Load injuries data
injuries = nfl.import_injuries([2023])

# View structure
print(injuries.info())

# Count injuries by status
if "report_status" in injuries.columns:
    status_counts = injuries["report_status"].value_counts()
    print("Injury Status Distribution:")
    print(status_counts)

# Most common injuries
if "report_primary_injury" in injuries.columns:
    injury_types = injuries["report_primary_injury"].value_counts().head(10)
    print("\nMost Common Injuries:")
    print(injury_types)
Packages: nflfastR tidyverse nfl_data_py pandas
Load Draft Picks Historical
Access historical NFL Draft data for prospect analysis.
Beginner
library(nflfastR)
library(tidyverse)

# Load draft picks
draft <- load_draft_picks()

# Filter to recent years
recent_draft <- draft %>%
  filter(season >= 2020)

# Analyze picks by position
position_analysis <- recent_draft %>%
  group_by(season, position) %>%
  summarize(
    picks = n(),
    avg_pick = mean(pick),
    .groups = "drop"
  )

# First round by position
first_round <- recent_draft %>%
  filter(round == 1) %>%
  count(position, sort = TRUE)

print(first_round)

# Top 10 picks each year
recent_draft %>%
  filter(pick <= 10) %>%
  select(season, pick, team, pfr_player_name, position) %>%
  arrange(season, pick)
import nfl_data_py as nfl
import pandas as pd

# Load draft picks
draft = nfl.import_draft_picks()

# Filter to recent years
recent_draft = draft[draft["season"] >= 2020]

# First round picks by position
first_round = recent_draft[recent_draft["round"] == 1]
position_counts = first_round["position"].value_counts()
print("First Round Picks by Position (2020+):")
print(position_counts)

# Top 10 picks
top_picks = recent_draft[recent_draft["pick"] <= 10][
    ["season", "pick", "team", "pfr_player_name", "position"]
].sort_values(["season", "pick"])
print("\nTop 10 Picks:")
print(top_picks)
Packages: nflfastR tidyverse nfl_data_py pandas
Load Snap Counts
Get weekly snap count data for all players.
Beginner
library(nflfastR)
library(tidyverse)

# Load snap counts
snaps <- load_snap_counts(2023)

# View structure
glimpse(snaps)

# Top offensive snap leaders
off_snaps <- snaps %>%
  group_by(player, position, team) %>%
  summarize(
    games = n(),
    total_off_snaps = sum(offense_snaps, na.rm = TRUE),
    avg_off_snaps = mean(offense_snaps, na.rm = TRUE),
    avg_off_pct = mean(offense_pct, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(games >= 10) %>%
  arrange(desc(total_off_snaps))

# Top snap share by position
snaps %>%
  filter(position %in% c("QB", "RB", "WR", "TE")) %>%
  group_by(position) %>%
  slice_max(offense_pct, n = 5) %>%
  select(player, team, position, week, offense_snaps, offense_pct)
import nfl_data_py as nfl
import pandas as pd

# Load snap counts
snaps = nfl.import_snap_counts([2023])

# View structure
print(snaps.info())

# Top offensive snap leaders
off_snaps = (snaps.groupby(["player", "position", "team"])
    .agg(
        games=("week", "count"),
        total_off_snaps=("offense_snaps", "sum"),
        avg_off_snaps=("offense_snaps", "mean"),
        avg_off_pct=("offense_pct", "mean")
    )
    .reset_index())

off_snaps = off_snaps[off_snaps["games"] >= 10].sort_values(
    "total_off_snaps", ascending=False)

print("Top Offensive Snap Leaders:")
print(off_snaps.head(20))
Packages: nflfastR tidyverse nfl_data_py pandas
Load Win Totals and Futures
Access preseason win total lines for betting analysis.
Intermediate
library(nflfastR)
library(tidyverse)

# Load schedules for win totals context
schedules <- load_schedules(2023)

# Calculate actual wins per team.
# result = home_score - away_score, so result > 0 is a home win,
# result < 0 an away win, and result == 0 a tie (worth half a win).
# (The earlier mutate() that created unused home_win/away_win columns
# has been removed; the case_when below derives everything from result.)
actual_wins <- schedules %>%
  filter(!is.na(result)) %>%
  pivot_longer(
    cols = c(home_team, away_team),
    names_to = "location",
    values_to = "team"
  ) %>%
  mutate(
    win = case_when(
      location == "home_team" & result > 0 ~ 1,
      location == "away_team" & result < 0 ~ 1,
      result == 0 ~ 0.5,
      TRUE ~ 0
    )
  ) %>%
  group_by(team) %>%
  summarize(
    games = n(),
    wins = sum(win),
    losses = games - wins
  ) %>%
  arrange(desc(wins))

print(actual_wins)
import nfl_data_py as nfl
import pandas as pd

# Load schedules
schedules = nfl.import_schedules([2023])

# Filter to completed games (result = home_score - away_score)
completed = schedules[schedules["result"].notna()]

# Wins earned in home games: a win counts 1, a tie counts 0.5
home = completed.assign(
    team=completed["home_team"],
    win=(completed["result"] > 0) * 1.0 + (completed["result"] == 0) * 0.5,
)

# Wins earned in away games
away = completed.assign(
    team=completed["away_team"],
    win=(completed["result"] < 0) * 1.0 + (completed["result"] == 0) * 0.5,
)

# Stack home and away rows, then total wins per team.
# This replaces scalar-returning groupby().apply() (deprecated in modern
# pandas for touching the grouping columns) and, unlike the old
# home/away inner merge, keeps a team even if it only appears on one side.
wins = (pd.concat([home, away])
    .groupby("team", as_index=False)["win"].sum()
    .rename(columns={"win": "total_wins"})
    .sort_values("total_wins", ascending=False))

print("Team Win Totals:")
print(wins)
Packages: nflfastR tidyverse nfl_data_py pandas
Load and Merge Multiple Data Sources
Combine play-by-play data with roster and participation data.
Advanced
library(nflfastR)
library(tidyverse)

# Load multiple data sources
pbp <- load_pbp(2023)
rosters <- fast_scraper_roster(2023)

# Get passer stats with roster info
passer_stats <- pbp %>%
  filter(!is.na(passer_player_id)) %>%
  group_by(passer_player_id, passer_player_name) %>%
  summarize(
    attempts = n(),
    completions = sum(complete_pass),
    yards = sum(passing_yards, na.rm = TRUE),
    tds = sum(pass_touchdown),
    ints = sum(interception),
    epa = sum(epa, na.rm = TRUE),
    .groups = "drop"
  )

# Merge with roster data
passer_with_roster <- passer_stats %>%
  left_join(
    rosters %>% select(gsis_id, full_name, team, age, height, weight, college),
    by = c("passer_player_id" = "gsis_id")
  )

# View combined data
passer_with_roster %>%
  filter(attempts >= 100) %>%
  select(passer_player_name, team, age, college, attempts, yards, tds, epa) %>%
  arrange(desc(epa))
import nfl_data_py as nfl
import pandas as pd

# Load multiple data sources
pbp = nfl.import_pbp_data([2023])
rosters = nfl.import_rosters([2023])

# Get passer stats
passer_stats = (pbp[pbp["passer_player_id"].notna()]
    .groupby(["passer_player_id", "passer_player_name"])
    .agg(
        attempts=("play_id", "count"),
        completions=("complete_pass", "sum"),
        yards=("passing_yards", "sum"),
        tds=("pass_touchdown", "sum"),
        ints=("interception", "sum"),
        epa=("epa", "sum")
    )
    .reset_index())

# Merge with roster data
passer_with_roster = passer_stats.merge(
    rosters[["gsis_id", "player_name", "team", "age", "college"]],
    left_on="passer_player_id",
    right_on="gsis_id",
    how="left"
)

# View combined data
result = (passer_with_roster[passer_with_roster["attempts"] >= 100]
    .sort_values("epa", ascending=False)
    [["passer_player_name", "team", "age", "college", "attempts", "yards", "tds", "epa"]])
print(result)
Packages: nflfastR tidyverse nfl_data_py pandas
Load QBR and Advanced Passing Metrics
Access ESPN QBR and other advanced quarterback metrics.
Intermediate
library(nflfastR)
library(tidyverse)

# Load QBR data
qbr <- load_espn_qbr(
  league = "nfl",
  seasons = 2023,
  summary_type = "season"
)

# View structure
glimpse(qbr)

# Top QBs by Total QBR
qbr %>%
  arrange(desc(qbr_total)) %>%
  select(name_display, team_abb, qbr_total, pts_added, pass_rating) %>%
  head(15)

# Weekly QBR trends
qbr_weekly <- load_espn_qbr(
  league = "nfl",
  seasons = 2023,
  summary_type = "week"
)

# Best single-game performances
qbr_weekly %>%
  arrange(desc(qbr_total)) %>%
  select(name_display, week, team_abb, qbr_total, pts_added) %>%
  head(10)
import nfl_data_py as nfl
import pandas as pd

# Load QBR data
qbr = nfl.import_qbr([2023])

# View structure
print(qbr.info())

# Top QBs by QBR (if available)
if "qbr_total" in qbr.columns:
    top_qbr = qbr.nlargest(15, "qbr_total")[
        ["name_display", "team_abb", "qbr_total", "pts_added", "pass_rating"]
    ]
    print("Top QBs by Total QBR:")
    print(top_qbr)
else:
    print("Available columns:", qbr.columns.tolist())
    print(qbr.head())
Packages: nflfastR tidyverse nfl_data_py pandas

EPA Analysis

Calculate and analyze Expected Points Added for teams, players, and situations

Calculate Team EPA per Play
Compute offensive and defensive EPA per play rankings for all teams.
Intermediate
library(nflfastR)
library(tidyverse)

# Load data
pbp <- load_pbp(2023)

# Filter to pass and rush plays
plays <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run"))

# Calculate offensive EPA/play
offense_epa <- plays %>%
  group_by(posteam) %>%
  summarize(
    plays = n(),
    total_epa = sum(epa),
    epa_per_play = mean(epa),
    pass_epa = mean(epa[play_type == "pass"]),
    rush_epa = mean(epa[play_type == "run"]),
    success_rate = mean(success)
  ) %>%
  arrange(desc(epa_per_play))

# Calculate defensive EPA/play (lower is better)
defense_epa <- plays %>%
  group_by(defteam) %>%
  summarize(
    plays = n(),
    epa_allowed = mean(epa),
    pass_epa_allowed = mean(epa[play_type == "pass"]),
    rush_epa_allowed = mean(epa[play_type == "run"])
  ) %>%
  arrange(epa_allowed)

print(offense_epa)
print(defense_epa)
import nfl_data_py as nfl
import pandas as pd

# Load data
pbp = nfl.import_pbp_data([2023])

# Filter to pass and rush plays
plays = pbp[(pbp["epa"].notna()) &
            (pbp["play_type"].isin(["pass", "run"]))]

# Calculate offensive EPA/play
offense_epa = (plays.groupby("posteam")
    .agg(
        plays=("epa", "count"),
        total_epa=("epa", "sum"),
        epa_per_play=("epa", "mean"),
        success_rate=("success", "mean")
    )
    .sort_values("epa_per_play", ascending=False)
    .reset_index())

# Calculate defensive EPA/play
defense_epa = (plays.groupby("defteam")
    .agg(epa_allowed=("epa", "mean"))
    .sort_values("epa_allowed")
    .reset_index())

print("Offensive EPA/Play:")
print(offense_epa.head(10))
print("\nDefensive EPA/Play:")
print(defense_epa.head(10))
Packages: nflfastR tidyverse nfl_data_py pandas
Quarterback EPA Rankings
Rank quarterbacks by EPA per dropback with minimum attempt filters.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# QB EPA analysis
qb_epa <- pbp %>%
  filter(!is.na(epa), !is.na(passer_player_id)) %>%
  group_by(passer_player_id, passer_player_name) %>%
  summarize(
    dropbacks = n(),
    total_epa = sum(epa),
    epa_per_dropback = mean(epa),
    cpoe = mean(cpoe, na.rm = TRUE),
    success_rate = mean(success),
    avg_air_yards = mean(air_yards, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(dropbacks >= 200) %>%
  arrange(desc(epa_per_dropback))

# Add rank
qb_epa <- qb_epa %>%
  mutate(rank = row_number())

print(qb_epa, n = 20)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Filter to pass plays with QB identified
pass_plays = pbp[(pbp["epa"].notna()) &
                 (pbp["passer_player_id"].notna())]

# QB EPA analysis
qb_epa = (pass_plays.groupby(["passer_player_id", "passer_player_name"])
    .agg(
        dropbacks=("epa", "count"),
        total_epa=("epa", "sum"),
        epa_per_dropback=("epa", "mean"),
        cpoe=("cpoe", "mean"),
        success_rate=("success", "mean"),
        avg_air_yards=("air_yards", "mean")
    )
    .reset_index())

# Filter min dropbacks and rank
qb_epa = (qb_epa[qb_epa["dropbacks"] >= 200]
    .sort_values("epa_per_dropback", ascending=False)
    .reset_index(drop=True))
qb_epa["rank"] = range(1, len(qb_epa) + 1)

print(qb_epa.head(20))
Packages: nflfastR tidyverse nfl_data_py pandas
EPA by Down and Distance
Analyze expected points by game situation (down, distance, field position).
Advanced
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# EPA by down
epa_by_down <- pbp %>%
  filter(!is.na(epa), down <= 4) %>%
  group_by(down) %>%
  summarize(
    plays = n(),
    avg_epa = mean(epa),
    pass_rate = mean(play_type == "pass", na.rm = TRUE),
    success_rate = mean(success)
  )

print(epa_by_down)

# EPA by distance buckets
epa_by_distance <- pbp %>%
  filter(!is.na(epa), down == 1) %>%
  mutate(
    distance_bucket = case_when(
      ydstogo <= 3 ~ "1-3 yards",
      ydstogo <= 7 ~ "4-7 yards",
      ydstogo <= 10 ~ "8-10 yards",
      TRUE ~ "11+ yards"
    )
  ) %>%
  group_by(distance_bucket) %>%
  summarize(
    plays = n(),
    avg_epa = mean(epa),
    success_rate = mean(success)
  )

# EPA by field position
epa_by_field <- pbp %>%
  filter(!is.na(epa)) %>%
  mutate(
    field_zone = case_when(
      yardline_100 >= 80 ~ "Own 1-20",
      yardline_100 >= 60 ~ "Own 21-40",
      yardline_100 >= 40 ~ "Midfield",
      yardline_100 >= 20 ~ "Opp 21-40",
      TRUE ~ "Red Zone"
    )
  ) %>%
  group_by(field_zone) %>%
  summarize(
    plays = n(),
    avg_epa = mean(epa),
    td_rate = mean(touchdown, na.rm = TRUE)
  )
import nfl_data_py as nfl
import pandas as pd
import numpy as np

pbp = nfl.import_pbp_data([2023])

# EPA by down (down is NaN on non-scrimmage rows; the <= 4 filter drops them)
plays = pbp[(pbp["epa"].notna()) & (pbp["down"] <= 4)]
epa_by_down = (plays.groupby("down")
    .agg(
        plays=("epa", "count"),
        avg_epa=("epa", "mean"),
        success_rate=("success", "mean")
    )
    .reset_index())
print("EPA by Down:")
print(epa_by_down)

# EPA by field position (yardline_100 = yards from the opponent's end zone)
def get_field_zone(yard):
    """Bucket a yardline_100 value into a broad field-position zone."""
    if yard >= 80: return "Own 1-20"
    elif yard >= 60: return "Own 21-40"
    elif yard >= 40: return "Midfield"
    elif yard >= 20: return "Opp 21-40"
    else: return "Red Zone"

# Use every play with a valid EPA (not just downs 1-4) to mirror the R
# version, which filters only on !is.na(epa). .copy() prevents pandas'
# SettingWithCopyWarning when the zone column is added to the slice.
field_plays = pbp[pbp["epa"].notna()].copy()
field_plays["field_zone"] = field_plays["yardline_100"].apply(get_field_zone)
epa_by_field = (field_plays.groupby("field_zone")
    .agg(
        plays=("epa", "count"),
        avg_epa=("epa", "mean"),
        td_rate=("touchdown", "mean")
    )
    .reset_index())
print("\nEPA by Field Position:")
print(epa_by_field)
Packages: nflfastR tidyverse nfl_data_py pandas numpy
Rolling EPA Trends
Calculate rolling EPA averages to track team performance over time.
Advanced
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Game-level EPA
game_epa <- pbp %>%
  filter(!is.na(epa)) %>%
  group_by(game_id, posteam, week) %>%
  summarize(
    epa_per_play = mean(epa),
    .groups = "drop"
  )

# Calculate 4-game rolling average
rolling_epa <- game_epa %>%
  arrange(posteam, week) %>%
  group_by(posteam) %>%
  mutate(
    rolling_4_epa = zoo::rollmean(epa_per_play, k = 4,
                                   fill = NA, align = "right")
  ) %>%
  ungroup()

# Find teams that improved most
improvement <- rolling_epa %>%
  filter(week %in% c(5, 17)) %>%  # After 4 games vs end
  select(posteam, week, rolling_4_epa) %>%
  pivot_wider(names_from = week, values_from = rolling_4_epa,
              names_prefix = "week_") %>%
  mutate(improvement = week_17 - week_5) %>%
  arrange(desc(improvement))

print(improvement)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Game-level EPA
game_epa = (pbp[pbp["epa"].notna()]
    .groupby(["game_id", "posteam", "week"])
    .agg(epa_per_play=("epa", "mean"))
    .reset_index())

# Calculate 4-game rolling average
game_epa = game_epa.sort_values(["posteam", "week"])
game_epa["rolling_4_epa"] = (game_epa.groupby("posteam")
    ["epa_per_play"].transform(lambda x: x.rolling(4).mean()))

# Find teams that improved most
early = game_epa[game_epa["week"] == 5][["posteam", "rolling_4_epa"]]
late = game_epa[game_epa["week"] == 17][["posteam", "rolling_4_epa"]]

improvement = early.merge(late, on="posteam", suffixes=("_early", "_late"))
improvement["change"] = improvement["rolling_4_epa_late"] - improvement["rolling_4_epa_early"]
improvement = improvement.sort_values("change", ascending=False)

print(improvement)
Packages: nflfastR tidyverse zoo nfl_data_py pandas
EPA by Play Type and Quarter
Analyze how EPA varies by play type across different game quarters.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# EPA by quarter and play type
epa_by_qtr <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run"), qtr <= 4) %>%
  group_by(qtr, play_type) %>%
  summarize(
    plays = n(),
    avg_epa = mean(epa),
    success_rate = mean(success),
    .groups = "drop"
  )

print(epa_by_qtr)

# Pass vs Run EPA advantage by quarter
epa_by_qtr %>%
  pivot_wider(names_from = play_type, values_from = c(avg_epa, success_rate)) %>%
  mutate(pass_advantage = avg_epa_pass - avg_epa_run)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Filter valid plays
plays = pbp[(pbp["epa"].notna()) &
            (pbp["play_type"].isin(["pass", "run"])) &
            (pbp["qtr"] <= 4)]

# EPA by quarter and play type
epa_by_qtr = (plays.groupby(["qtr", "play_type"])
    .agg(
        plays=("epa", "count"),
        avg_epa=("epa", "mean"),
        success_rate=("success", "mean")
    )
    .reset_index())

print("EPA by Quarter and Play Type:")
print(epa_by_qtr)

# Pass vs Run comparison
pivot = epa_by_qtr.pivot(index="qtr", columns="play_type", values="avg_epa")
pivot["pass_advantage"] = pivot["pass"] - pivot["run"]
print("\nPass EPA Advantage by Quarter:")
print(pivot)
Packages: nflfastR tidyverse nfl_data_py pandas
Red Zone EPA Analysis
Evaluate team and player efficiency inside the opponent's 20-yard line.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Red zone analysis (inside opponent 20)
red_zone <- pbp %>%
  filter(!is.na(epa), yardline_100 <= 20, play_type %in% c("pass", "run"))

# Team red zone efficiency
team_rz <- red_zone %>%
  group_by(posteam) %>%
  summarize(
    rz_plays = n(),
    rz_epa = mean(epa),
    rz_td_rate = mean(touchdown, na.rm = TRUE),
    rz_success_rate = mean(success),
    pass_rate = mean(play_type == "pass")
  ) %>%
  arrange(desc(rz_td_rate))

print(team_rz)

# Red zone vs rest of field comparison
rbind(
  red_zone %>% summarize(zone = "Red Zone", epa = mean(epa), success = mean(success)),
  pbp %>% filter(yardline_100 > 20, !is.na(epa)) %>%
    summarize(zone = "Outside RZ", epa = mean(epa), success = mean(success))
)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Red zone plays (inside opponent 20)
red_zone = pbp[(pbp["epa"].notna()) &
               (pbp["yardline_100"] <= 20) &
               (pbp["play_type"].isin(["pass", "run"]))]

# Team red zone efficiency
team_rz = (red_zone.groupby("posteam")
    .agg(
        rz_plays=("epa", "count"),
        rz_epa=("epa", "mean"),
        rz_td_rate=("touchdown", "mean"),
        rz_success_rate=("success", "mean")
    )
    .reset_index()
    .sort_values("rz_td_rate", ascending=False))

print("Team Red Zone Efficiency:")
print(team_rz)

# Red zone vs outside comparison
outside_rz = pbp[(pbp["epa"].notna()) & (pbp["yardline_100"] > 20)]
print(f"\nRed Zone EPA: {red_zone['epa'].mean():.3f}")
print(f"Outside RZ EPA: {outside_rz['epa'].mean():.3f}")
Packages: nflfastR tidyverse nfl_data_py pandas
Third Down EPA Breakdown
Analyze third down conversion efficiency using EPA metrics.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Third down analysis
third_down <- pbp %>%
  filter(!is.na(epa), down == 3)

# By distance bucket
third_by_dist <- third_down %>%
  mutate(
    dist_bucket = case_when(
      ydstogo <= 3 ~ "Short (1-3)",
      ydstogo <= 6 ~ "Medium (4-6)",
      ydstogo <= 10 ~ "Long (7-10)",
      TRUE ~ "Very Long (11+)"
    )
  ) %>%
  group_by(dist_bucket) %>%
  summarize(
    attempts = n(),
    conversion_rate = mean(first_down, na.rm = TRUE),
    avg_epa = mean(epa),
    pass_rate = mean(play_type == "pass", na.rm = TRUE)
  )

print(third_by_dist)

# Team third down rankings
team_3rd <- third_down %>%
  group_by(posteam) %>%
  summarize(
    attempts = n(),
    conversions = sum(first_down, na.rm = TRUE),
    conv_rate = mean(first_down, na.rm = TRUE),
    epa_per_play = mean(epa)
  ) %>%
  arrange(desc(conv_rate))

print(team_3rd)
import nfl_data_py as nfl
import pandas as pd
import numpy as np

pbp = nfl.import_pbp_data([2023])

# Third down plays.
# .copy() so the bucket-column assignment below doesn't trigger pandas'
# SettingWithCopyWarning on a filtered slice of pbp.
third_down = pbp[(pbp["epa"].notna()) & (pbp["down"] == 3)].copy()

# Distance buckets
def dist_bucket(ydstogo):
    """Bucket yards-to-go into short/medium/long/very-long ranges."""
    if ydstogo <= 3: return "Short (1-3)"
    elif ydstogo <= 6: return "Medium (4-6)"
    elif ydstogo <= 10: return "Long (7-10)"
    else: return "Very Long (11+)"

third_down["dist_bucket"] = third_down["ydstogo"].apply(dist_bucket)

# Analysis by distance
third_by_dist = (third_down.groupby("dist_bucket")
    .agg(
        attempts=("epa", "count"),
        conversion_rate=("first_down", "mean"),
        avg_epa=("epa", "mean")
    )
    .reset_index())

print("Third Down by Distance:")
print(third_by_dist)

# Team rankings
team_3rd = (third_down.groupby("posteam")
    .agg(
        attempts=("epa", "count"),
        conv_rate=("first_down", "mean"),
        epa_per_play=("epa", "mean")
    )
    .reset_index()
    .sort_values("conv_rate", ascending=False))

print("\nTeam Third Down Rankings:")
print(team_3rd.head(10))
Packages: nflfastR tidyverse nfl_data_py pandas numpy
Garbage Time Filtering
Filter out garbage time plays to get more meaningful EPA analysis.
Advanced
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Define garbage time (various definitions)
# Common: Win prob < 10% or > 90% in 4th quarter
pbp_filtered <- pbp %>%
  filter(!is.na(epa)) %>%
  mutate(
    garbage_time = case_when(
      qtr == 4 & (wp < 0.10 | wp > 0.90) ~ TRUE,
      qtr == 4 & abs(score_differential) > 21 ~ TRUE,
      TRUE ~ FALSE
    )
  )

# Compare EPA with and without garbage time
comparison <- pbp_filtered %>%
  filter(play_type %in% c("pass", "run")) %>%
  group_by(posteam, garbage_time) %>%
  summarize(
    plays = n(),
    epa_per_play = mean(epa),
    .groups = "drop"
  ) %>%
  pivot_wider(names_from = garbage_time, values_from = c(plays, epa_per_play))

# Teams with biggest garbage time inflation
comparison %>%
  mutate(epa_diff = epa_per_play_TRUE - epa_per_play_FALSE) %>%
  arrange(desc(epa_diff)) %>%
  select(posteam, epa_per_play_FALSE, epa_per_play_TRUE, epa_diff)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Filter valid plays
plays = pbp[pbp["epa"].notna()].copy()

# Define garbage time
plays["garbage_time"] = (
    ((plays["qtr"] == 4) & ((plays["wp"] < 0.10) | (plays["wp"] > 0.90))) |
    ((plays["qtr"] == 4) & (plays["score_differential"].abs() > 21))
)

# Compare with and without garbage time
plays_filtered = plays[plays["play_type"].isin(["pass", "run"])]

# EPA without garbage time
clean_epa = (plays_filtered[~plays_filtered["garbage_time"]]
    .groupby("posteam")["epa"].mean())

# EPA with garbage time
all_epa = plays_filtered.groupby("posteam")["epa"].mean()

# Compare
comparison = pd.DataFrame({
    "clean_epa": clean_epa,
    "all_epa": all_epa
})
comparison["garbage_inflation"] = comparison["all_epa"] - comparison["clean_epa"]
comparison = comparison.sort_values("garbage_inflation", ascending=False)

print("Garbage Time EPA Inflation:")
print(comparison.head(10))
Packages: nflfastR tidyverse nfl_data_py pandas
EPA Stability Year-over-Year
Analyze how stable EPA metrics are from season to season.
Advanced
library(nflfastR)
library(tidyverse)

# Load two consecutive seasons
pbp_2022 <- load_pbp(2022)
pbp_2023 <- load_pbp(2023)

# Calculate team EPA for each season
get_team_epa <- function(pbp, season) {
  pbp %>%
    filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
    group_by(posteam) %>%
    summarize(
      epa_per_play = mean(epa),
      pass_epa = mean(epa[play_type == "pass"]),
      rush_epa = mean(epa[play_type == "run"]),
      .groups = "drop"
    ) %>%
    mutate(season = season)
}

epa_2022 <- get_team_epa(pbp_2022, 2022)
epa_2023 <- get_team_epa(pbp_2023, 2023)

# Join and calculate correlation
combined <- epa_2022 %>%
  inner_join(epa_2023, by = "posteam", suffix = c("_2022", "_2023"))

# Correlations
cat("Overall EPA correlation:", cor(combined$epa_per_play_2022, combined$epa_per_play_2023), "\n")
cat("Pass EPA correlation:", cor(combined$pass_epa_2022, combined$pass_epa_2023), "\n")
cat("Rush EPA correlation:", cor(combined$rush_epa_2022, combined$rush_epa_2023), "\n")
import nfl_data_py as nfl
import pandas as pd

# Load two consecutive seasons
pbp_2022 = nfl.import_pbp_data([2022])
pbp_2023 = nfl.import_pbp_data([2023])

def get_team_epa(pbp):
    """Mean EPA per play (pass/run plays only) for each offense."""
    valid = pbp["epa"].notna() & pbp["play_type"].isin(["pass", "run"])
    return pbp.loc[valid].groupby("posteam")["epa"].mean()

epa_2022 = get_team_epa(pbp_2022)
epa_2023 = get_team_epa(pbp_2023)

# Combine the two season Series side by side, keeping only teams
# that appear in both seasons
combined = pd.concat(
    {"epa_2022": epa_2022, "epa_2023": epa_2023}, axis=1
).dropna()

# Calculate correlation
correlation = combined["epa_2022"].corr(combined["epa_2023"])
print(f"Year-over-Year EPA Correlation: {correlation:.3f}")

# Show biggest changes
combined["change"] = combined["epa_2023"] - combined["epa_2022"]
print("\nBiggest Improvements:")
print(combined.nlargest(5, "change"))
print("\nBiggest Declines:")
print(combined.nsmallest(5, "change"))
Packages: nflfastR tidyverse nfl_data_py pandas
Explosive Play EPA Analysis
Analyze the impact of explosive plays (20+ yard gains) on team EPA.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Define explosive plays (20+ yards)
# NOTE(review): `big_play` (40+) is computed but not summarized below;
# it is available if you want an extra tier.
explosive <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
  mutate(
    explosive = yards_gained >= 20,
    big_play = yards_gained >= 40
  )

# Team explosive play rates
# epa_from_explosive_pct = share of a team's total EPA that came from
# explosive plays alone.
team_explosive <- explosive %>%
  group_by(posteam) %>%
  summarize(
    total_plays = n(),
    explosive_plays = sum(explosive),
    explosive_rate = mean(explosive) * 100,
    explosive_epa = sum(epa[explosive]),
    non_explosive_epa = sum(epa[!explosive]),
    epa_from_explosive_pct = explosive_epa / sum(epa) * 100
  ) %>%
  arrange(desc(explosive_rate))

print(team_explosive)

# Explosive vs non-explosive EPA comparison
cat("\nAverage EPA on explosive plays:",
    mean(explosive$epa[explosive$explosive]), "\n")
cat("Average EPA on non-explosive plays:",
    mean(explosive$epa[!explosive$explosive]), "\n")
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Filter valid plays
# .copy() so the column assignment below writes to an independent frame
plays = pbp[(pbp["epa"].notna()) &
            (pbp["play_type"].isin(["pass", "run"]))].copy()

# Define explosive plays
plays["explosive"] = plays["yards_gained"] >= 20

# Team explosive play analysis
# Named aggregations: count of plays, count of explosive plays, total EPA
team_explosive = (plays.groupby("posteam")
    .agg(
        total_plays=("epa", "count"),
        explosive_plays=("explosive", "sum"),
        total_epa=("epa", "sum")
    )
    .reset_index())

team_explosive["explosive_rate"] = (team_explosive["explosive_plays"] /
                                     team_explosive["total_plays"] * 100)
team_explosive = team_explosive.sort_values("explosive_rate", ascending=False)

print("Team Explosive Play Rates:")
print(team_explosive)

# EPA comparison
explosive_epa = plays[plays["explosive"]]["epa"].mean()
non_explosive_epa = plays[~plays["explosive"]]["epa"].mean()
print(f"\nAverage EPA on explosive plays: {explosive_epa:.3f}")
print(f"Average EPA on non-explosive plays: {non_explosive_epa:.3f}")
Packages: nflfastR tidyverse nfl_data_py pandas

Win Probability

Analyze win probability, WPA, and clutch performance metrics

Basic Win Probability Lookup
Access and understand win probability values in play-by-play data.
Beginner
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Win probability is already calculated in pbp data
# Key columns: wp, vegas_wp, home_wp, away_wp

# View WP at key moments
key_plays <- pbp %>%
  filter(!is.na(wp), !is.na(desc)) %>%
  select(game_id, qtr, time, posteam, desc, wp, wpa) %>%
  head(20)

print(key_plays)

# Average WP by score differential
# Keep only differentials with a reasonable sample (>= 100 plays) and
# within three scores either way.
wp_by_score <- pbp %>%
  filter(!is.na(wp), !is.na(score_differential)) %>%
  group_by(score_differential) %>%
  summarize(
    avg_wp = mean(wp),
    plays = n()
  ) %>%
  filter(plays >= 100, abs(score_differential) <= 21)

print(wp_by_score)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Win probability columns: wp, vegas_wp, home_wp, away_wp
# View key WP columns
wp_cols = ["game_id", "qtr", "time", "posteam", "wp", "wpa"]
key_plays = pbp[pbp["wp"].notna()][wp_cols].head(20)
print("Sample Win Probability Data:")
print(key_plays)

# Average WP by score differential
wp_by_score = (pbp[pbp["wp"].notna() & pbp["score_differential"].notna()]
    .groupby("score_differential")
    .agg(avg_wp=("wp", "mean"), plays=("wp", "count"))
    .reset_index())

# Keep only differentials with >= 100 plays and within three scores
wp_by_score = wp_by_score[
    (wp_by_score["plays"] >= 100) &
    (wp_by_score["score_differential"].abs() <= 21)
]
print("\nAverage WP by Score Differential:")
print(wp_by_score)
Packages: nflfastR tidyverse nfl_data_py pandas
Win Probability Added (WPA) Leaders
Find players who contributed most to their team's win probability.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# QB WPA leaders
# Minimum 200 dropbacks filters out backups and trick-play passers.
qb_wpa <- pbp %>%
  filter(!is.na(wpa), !is.na(passer_player_id)) %>%
  group_by(passer_player_id, passer_player_name) %>%
  summarize(
    dropbacks = n(),
    total_wpa = sum(wpa),
    positive_plays = sum(wpa > 0),
    negative_plays = sum(wpa < 0),
    .groups = "drop"
  ) %>%
  filter(dropbacks >= 200) %>%
  arrange(desc(total_wpa))

print(qb_wpa)

# Single biggest WPA plays of the season
# Sorted by absolute WPA so huge negative swings also surface.
pbp %>%
  filter(!is.na(wpa)) %>%
  arrange(desc(abs(wpa))) %>%
  select(game_id, posteam, qtr, desc, wpa) %>%
  head(10)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# QB WPA leaders
qb_plays = pbp[pbp["wpa"].notna() & pbp["passer_player_id"].notna()]
# Lambdas count plays that added vs subtracted win probability
qb_wpa = (qb_plays.groupby(["passer_player_id", "passer_player_name"])
    .agg(
        dropbacks=("wpa", "count"),
        total_wpa=("wpa", "sum"),
        positive_plays=("wpa", lambda x: (x > 0).sum()),
        negative_plays=("wpa", lambda x: (x < 0).sum())
    )
    .reset_index())

# Minimum 200 dropbacks filters out backups and trick-play passers
qb_wpa = qb_wpa[qb_wpa["dropbacks"] >= 200].sort_values("total_wpa", ascending=False)
print("QB WPA Leaders:")
print(qb_wpa)

# Biggest WPA plays
# Rank by absolute WPA so big negative swings also surface
biggest_plays = (pbp[pbp["wpa"].notna()]
    .assign(abs_wpa=lambda x: x["wpa"].abs())
    .nlargest(10, "abs_wpa")
    [["game_id", "posteam", "qtr", "desc", "wpa"]])
print("\nBiggest WPA Plays:")
print(biggest_plays)
Packages: nflfastR tidyverse nfl_data_py pandas
Clutch Performance Analysis
Identify players who perform best in high-leverage situations.
Advanced
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Define clutch situations: 4th quarter, WP between 25-75%
# (game still in doubt — excludes garbage time on both ends)
clutch_plays <- pbp %>%
  filter(
    !is.na(epa),
    qtr == 4,
    wp >= 0.25 & wp <= 0.75
  )

# QB clutch performance
qb_clutch <- clutch_plays %>%
  filter(!is.na(passer_player_id)) %>%
  group_by(passer_player_id, passer_player_name) %>%
  summarize(
    clutch_plays = n(),
    clutch_epa = sum(epa),
    clutch_wpa = sum(wpa, na.rm = TRUE),
    clutch_success = mean(success),
    .groups = "drop"
  ) %>%
  filter(clutch_plays >= 30) %>%
  arrange(desc(clutch_wpa))

print(qb_clutch)

# Compare clutch vs non-clutch
# pivot_wider produces one row per QB with columns FALSE/TRUE
# (non-clutch vs clutch EPA/play).
pbp %>%
  filter(!is.na(passer_player_id), !is.na(epa)) %>%
  mutate(
    is_clutch = qtr == 4 & wp >= 0.25 & wp <= 0.75
  ) %>%
  group_by(passer_player_name, is_clutch) %>%
  summarize(epa_per_play = mean(epa), .groups = "drop") %>%
  pivot_wider(names_from = is_clutch, values_from = epa_per_play)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Define clutch situations: 4th quarter with WP between 25-75%
# (game still in doubt — excludes garbage time on both ends)
clutch_plays = pbp[
    (pbp["epa"].notna()) &
    (pbp["qtr"] == 4) &
    (pbp["wp"] >= 0.25) &
    (pbp["wp"] <= 0.75)
]

# QB clutch performance
qb_clutch = (clutch_plays[clutch_plays["passer_player_id"].notna()]
    .groupby(["passer_player_id", "passer_player_name"])
    .agg(
        clutch_plays=("epa", "count"),
        clutch_epa=("epa", "sum"),
        clutch_wpa=("wpa", "sum"),
        clutch_success=("success", "mean")
    )
    .reset_index())

# Require at least 30 clutch plays to avoid tiny-sample leaders
qb_clutch = qb_clutch[qb_clutch["clutch_plays"] >= 30].sort_values(
    "clutch_wpa", ascending=False)

print("QB Clutch Performance (4Q, WP 25-75%):")
print(qb_clutch)
Packages: nflfastR tidyverse nfl_data_py pandas
Game Swing Plays
Identify the biggest momentum-shifting plays in games.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Find biggest swing plays (largest WPA changes)
# Absolute WPA captures swings in either team's favor.
swing_plays <- pbp %>%
  filter(!is.na(wpa)) %>%
  mutate(abs_wpa = abs(wpa)) %>%
  arrange(desc(abs_wpa)) %>%
  select(game_id, week, posteam, qtr, time, down, ydstogo,
         desc, wpa, wp) %>%
  head(50)

print(swing_plays)

# Game-changing interceptions
ints <- pbp %>%
  filter(interception == 1, !is.na(wpa)) %>%
  arrange(desc(abs(wpa))) %>%
  select(game_id, passer_player_name, interception_player_name,
         wpa, wp, desc) %>%
  head(10)

print(ints)

# Game-changing touchdowns
# Sorted by raw (not absolute) WPA: we want TDs that HELPED the
# possession team most.
tds <- pbp %>%
  filter(touchdown == 1, !is.na(wpa)) %>%
  arrange(desc(wpa)) %>%
  select(game_id, posteam, qtr, wpa, wp, desc) %>%
  head(10)

print(tds)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Find biggest swing plays
# .copy() so the abs_wpa assignment writes to an independent frame
swing_plays = pbp[pbp["wpa"].notna()].copy()
swing_plays["abs_wpa"] = swing_plays["wpa"].abs()
swing_plays = swing_plays.nlargest(50, "abs_wpa")[
    ["game_id", "week", "posteam", "qtr", "time", "down",
     "ydstogo", "desc", "wpa", "wp"]
]

print("Biggest Swing Plays:")
print(swing_plays.head(20))

# Game-changing interceptions
# Ranked by absolute WPA to capture swings in either direction
ints = (pbp[(pbp["interception"] == 1) & (pbp["wpa"].notna())]
    .assign(abs_wpa=lambda x: x["wpa"].abs())
    .nlargest(10, "abs_wpa")
    [["game_id", "passer_player_name", "interception_player_name", "wpa", "wp"]])

print("\nBiggest Interceptions by WPA:")
print(ints)
Packages: nflfastR tidyverse nfl_data_py pandas
Fourth Quarter Comeback Analysis
Track teams that came back from deficits in the fourth quarter.
Advanced
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Find games with 4th quarter comebacks
# first()/last() rely on pbp rows being in play order within each game.
comebacks <- pbp %>%
  filter(qtr == 4) %>%
  group_by(game_id, home_team, away_team) %>%
  summarize(
    start_wp_home = first(home_wp),
    end_wp_home = last(home_wp),
    home_score = last(total_home_score),
    away_score = last(total_away_score),
    .groups = "drop"
  ) %>%
  mutate(
    home_won = home_score > away_score,
    # A comeback: the eventual winner entered Q4 with < 25% WP
    comeback = (start_wp_home < 0.25 & home_won) |
               (start_wp_home > 0.75 & !home_won),
    wp_swing = abs(end_wp_home - start_wp_home)
  ) %>%
  filter(comeback) %>%
  arrange(desc(wp_swing))

print(comebacks)

# Teams with most comebacks
comeback_counts <- comebacks %>%
  mutate(
    comeback_team = if_else(home_won, home_team, away_team)
  ) %>%
  count(comeback_team, sort = TRUE)

print(comeback_counts)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Filter to 4th quarter plays
q4 = pbp[pbp["qtr"] == 4].copy()

# Get start and end WP for each game
# "first"/"last" rely on pbp rows being in play order within each game
game_wp = (q4.groupby(["game_id", "home_team", "away_team"])
    .agg(
        start_wp_home=("home_wp", "first"),
        end_wp_home=("home_wp", "last"),
        home_score=("total_home_score", "last"),
        away_score=("total_away_score", "last")
    )
    .reset_index())

game_wp["home_won"] = game_wp["home_score"] > game_wp["away_score"]
# A comeback: the eventual winner entered Q4 with < 25% win probability
game_wp["comeback"] = (
    ((game_wp["start_wp_home"] < 0.25) & game_wp["home_won"]) |
    ((game_wp["start_wp_home"] > 0.75) & ~game_wp["home_won"])
)
game_wp["wp_swing"] = (game_wp["end_wp_home"] - game_wp["start_wp_home"]).abs()

comebacks = game_wp[game_wp["comeback"]].sort_values("wp_swing", ascending=False)
print("4th Quarter Comebacks:")
print(comebacks)
Packages: nflfastR tidyverse nfl_data_py pandas
Win Probability Charts
Create win probability charts for individual games.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Select a specific game (e.g., Super Bowl or exciting game)
game <- pbp %>%
  filter(week == 1, home_team == "KC") %>%
  filter(!is.na(wp))

# Create WP chart data. `wp` is from the possession team's point of
# view, so flip it whenever the away team has the ball.
wp_chart <- game %>%
  select(game_seconds_remaining, wp, posteam, home_team, away_team) %>%
  mutate(
    game_time = 3600 - game_seconds_remaining,
    home_wp = if_else(posteam == home_team, wp, 1 - wp)
  )

# Plot. `linewidth` replaces the deprecated `size` aesthetic for lines
# (ggplot2 >= 3.4.0).
ggplot(wp_chart, aes(x = game_time, y = home_wp)) +
  geom_line(color = "#E31837", linewidth = 1) +
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "gray50") +
  scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
  scale_x_continuous(
    breaks = c(0, 900, 1800, 2700, 3600),
    labels = c("Q1", "Q2", "Q3", "Q4", "End")
  ) +
  labs(
    title = paste(unique(game$away_team), "@", unique(game$home_team)),
    x = "Game Time",
    y = "Home Win Probability"
  ) +
  theme_minimal()
import nfl_data_py as nfl
import pandas as pd
import matplotlib.pyplot as plt

pbp = nfl.import_pbp_data([2023])

# Select a specific game
game = pbp[(pbp["week"] == 1) & (pbp["home_team"] == "KC")]
game = game[game["wp"].notna()].copy()

# Calculate game time and home WP
# "wp" is from the possession team's point of view, so flip it
# whenever the away team has the ball.
game["game_time"] = 3600 - game["game_seconds_remaining"]
game["home_wp"] = game.apply(
    lambda x: x["wp"] if x["posteam"] == x["home_team"] else 1 - x["wp"],
    axis=1
)

# Create plot
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(game["game_time"], game["home_wp"], color="#E31837", linewidth=2)
ax.axhline(y=0.5, color="gray", linestyle="--", alpha=0.5)

# Quarter boundaries every 900 seconds of a 3600-second game
ax.set_xlim(0, 3600)
ax.set_ylim(0, 1)
ax.set_xticks([0, 900, 1800, 2700, 3600])
ax.set_xticklabels(["Q1", "Q2", "Q3", "Q4", "End"])
ax.set_ylabel("Home Win Probability")
ax.set_title(f"{game['away_team'].iloc[0]} @ {game['home_team'].iloc[0]}")

plt.tight_layout()
plt.savefig("wp_chart.png", dpi=300)
plt.show()
Packages: nflfastR tidyverse ggplot2 nfl_data_py pandas matplotlib
Expected Points vs Win Probability
Compare EPA and WPA to understand their relationship.
Advanced
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Correlation between EPA and WPA
plays <- pbp %>%
  filter(!is.na(epa), !is.na(wpa))

cat("Correlation EPA vs WPA:", cor(plays$epa, plays$wpa), "\n")

# EPA vs WPA by game situation
# case_when is order-sensitive: first-half plays are labeled
# "First Half" even when the game is already a blowout.
situation_comparison <- plays %>%
  mutate(
    situation = case_when(
      qtr <= 2 ~ "First Half",
      wp > 0.75 | wp < 0.25 ~ "Blowout",
      TRUE ~ "Competitive"
    )
  ) %>%
  group_by(situation) %>%
  summarize(
    plays = n(),
    avg_epa = mean(epa),
    avg_wpa = mean(wpa),
    epa_wpa_cor = cor(epa, wpa)
  )

print(situation_comparison)

# High EPA but low WPA (garbage time) vs low EPA high WPA (clutch)
# Cross-tab of EPA quartile vs WPA quartile; off-diagonal cells are
# plays where the two metrics disagree.
plays %>%
  mutate(
    epa_quartile = ntile(epa, 4),
    wpa_quartile = ntile(wpa, 4)
  ) %>%
  count(epa_quartile, wpa_quartile) %>%
  pivot_wider(names_from = wpa_quartile, values_from = n)
import nfl_data_py as nfl
import pandas as pd
import numpy as np

pbp = nfl.import_pbp_data([2023])

# Filter plays with both EPA and WPA.
# .copy() is required: the "situation" column assignment below would
# otherwise target a view of pbp and raise SettingWithCopyWarning
# (and may silently fail to write).
plays = pbp[(pbp["epa"].notna()) & (pbp["wpa"].notna())].copy()

# Overall correlation
correlation = plays["epa"].corr(plays["wpa"])
print(f"EPA vs WPA Correlation: {correlation:.3f}")

# By game situation. Branch order matters: a first-half play is labeled
# "First Half" even if the game is already a blowout.
def get_situation(row):
    if row["qtr"] <= 2:
        return "First Half"
    elif row["wp"] > 0.75 or row["wp"] < 0.25:
        return "Blowout"
    else:
        return "Competitive"

plays["situation"] = plays.apply(get_situation, axis=1)

situation_comparison = (plays.groupby("situation")
    .agg(
        plays=("epa", "count"),
        avg_epa=("epa", "mean"),
        avg_wpa=("wpa", "mean")
    )
    .reset_index())

print("\nEPA vs WPA by Situation:")
print(situation_comparison)
Packages: nflfastR tidyverse nfl_data_py pandas numpy
Late-Game Decision Analysis
Analyze coaching decisions in crucial late-game situations using WP.
Advanced
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Fourth down decisions in close 4th quarter games.
# nflfastR play-by-play has no `fourth_down_decision` column, so derive
# the coaching decision from `play_type`: pass/run = go for it,
# otherwise punt or field goal attempt.
late_4th_downs <- pbp %>%
  filter(
    down == 4,
    qtr == 4,
    wp >= 0.20 & wp <= 0.80
  ) %>%
  mutate(
    fourth_down_decision = case_when(
      play_type %in% c("pass", "run") ~ "go",
      play_type == "punt" ~ "punt",
      play_type == "field_goal" ~ "field_goal",
      TRUE ~ NA_character_
    )
  ) %>%
  filter(!is.na(fourth_down_decision))

# Decision distribution
late_4th_downs %>%
  count(fourth_down_decision)

# WP impact by decision
# `fourth_down_converted` is a real nflfastR column (1 when converted).
decision_impact <- late_4th_downs %>%
  group_by(fourth_down_decision) %>%
  summarize(
    attempts = n(),
    avg_wpa = mean(wpa, na.rm = TRUE),
    success_rate = mean(fourth_down_converted, na.rm = TRUE),
    avg_wp_before = mean(wp)
  )

print(decision_impact)

# Aggressive vs conservative coaches
coach_decisions <- late_4th_downs %>%
  group_by(posteam) %>%
  summarize(
    fourth_downs = n(),
    go_for_it_rate = mean(fourth_down_decision == "go"),
    punt_rate = mean(fourth_down_decision == "punt"),
    fg_rate = mean(fourth_down_decision == "field_goal")
  ) %>%
  arrange(desc(go_for_it_rate))

print(coach_decisions)
import nfl_data_py as nfl
import pandas as pd
import numpy as np

pbp = nfl.import_pbp_data([2023])

# Fourth down plays in close 4th quarter games.
# .copy() because a derived column is assigned below.
late_4th = pbp[
    (pbp["down"] == 4) &
    (pbp["qtr"] == 4) &
    (pbp["wp"] >= 0.20) &
    (pbp["wp"] <= 0.80)
].copy()

# nfl_data_py play-by-play has no "fourth_down_decision" column, so
# derive the coaching decision from play_type: pass/run = go for it,
# otherwise punt or field goal attempt.
late_4th["fourth_down_decision"] = np.select(
    [
        late_4th["play_type"].isin(["pass", "run"]),
        late_4th["play_type"] == "punt",
        late_4th["play_type"] == "field_goal",
    ],
    ["go", "punt", "field_goal"],
    default=None,
)
late_4th = late_4th[late_4th["fourth_down_decision"].notna()]

# Decision distribution
print("4th Down Decision Distribution:")
print(late_4th["fourth_down_decision"].value_counts())

# WP impact by decision
decision_impact = (late_4th.groupby("fourth_down_decision")
    .agg(
        attempts=("wpa", "count"),
        avg_wpa=("wpa", "mean"),
        avg_wp_before=("wp", "mean")
    )
    .reset_index())

print("\nDecision Impact:")
print(decision_impact)

# Team aggressiveness: share of these 4th downs each team went for
team_decisions = (late_4th.groupby("posteam")
    .agg(fourth_downs=("down", "count"))
    .reset_index())

go_counts = late_4th[late_4th["fourth_down_decision"] == "go"].groupby("posteam").size()
team_decisions = team_decisions.merge(
    go_counts.reset_index(name="go_for_it"),
    on="posteam", how="left"
).fillna(0)
team_decisions["go_rate"] = team_decisions["go_for_it"] / team_decisions["fourth_downs"]

print("\nTeam Aggressiveness:")
print(team_decisions.sort_values("go_rate", ascending=False))
Packages: nflfastR tidyverse nfl_data_py pandas

Data Visualization

Create publication-quality charts and visualizations for NFL data

EPA Scatter Plot with Team Logos
Create an offense vs defense EPA plot with team logos.
Intermediate
library(nflfastR)
library(nflplotR)
library(tidyverse)

pbp <- load_pbp(2023)

# Calculate team EPA: offensive EPA/play by posteam, joined with
# defensive EPA/play allowed by defteam.
team_epa <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
  group_by(posteam) %>%
  summarize(off_epa = mean(epa)) %>%
  left_join(
    pbp %>%
      filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
      group_by(defteam) %>%
      summarize(def_epa = mean(epa)),
    by = c("posteam" = "defteam")
  )

# Create plot with logos. The y axis is reversed so that better
# defenses (lower EPA allowed) appear toward the top, matching the
# "lower is better" axis label (the Python version inverts the axis
# the same way).
ggplot(team_epa, aes(x = off_epa, y = def_epa)) +
  geom_mean_lines(aes(y0 = def_epa, x0 = off_epa)) +
  geom_nfl_logos(aes(team_abbr = posteam), width = 0.065) +
  scale_y_reverse() +
  labs(
    title = "NFL Team Efficiency (2023)",
    subtitle = "Offensive vs Defensive EPA/Play",
    x = "Offensive EPA/Play (higher is better)",
    y = "Defensive EPA/Play (lower is better)",
    caption = "Data: nflfastR"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    axis.title = element_text(size = 12)
  )

ggsave("epa_scatter.png", width = 10, height = 8, dpi = 300)
import nfl_data_py as nfl
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

pbp = nfl.import_pbp_data([2023])

# Calculate team EPA
plays = pbp[(pbp["epa"].notna()) &
            (pbp["play_type"].isin(["pass", "run"]))]

# Offensive EPA/play by possession team
off_epa = plays.groupby("posteam")["epa"].mean().reset_index()
off_epa.columns = ["team", "off_epa"]

# Defensive EPA/play allowed by defending team
def_epa = plays.groupby("defteam")["epa"].mean().reset_index()
def_epa.columns = ["team", "def_epa"]

team_epa = off_epa.merge(def_epa, on="team")

# Create scatter plot
fig, ax = plt.subplots(figsize=(12, 10))

ax.scatter(team_epa["off_epa"], team_epa["def_epa"], s=100, alpha=0.7)

# Add team labels
for idx, row in team_epa.iterrows():
    ax.annotate(row["team"], (row["off_epa"], row["def_epa"]),
                fontsize=9, ha="center", va="bottom")

# Add mean lines
ax.axhline(y=team_epa["def_epa"].mean(), color="gray",
           linestyle="--", alpha=0.5)
ax.axvline(x=team_epa["off_epa"].mean(), color="gray",
           linestyle="--", alpha=0.5)

ax.set_xlabel("Offensive EPA/Play (higher is better)", fontsize=12)
ax.set_ylabel("Defensive EPA/Play (lower is better)", fontsize=12)
ax.set_title("NFL Team Efficiency (2023)", fontsize=16, fontweight="bold")
ax.invert_yaxis()  # Lower def EPA is better

plt.tight_layout()
plt.savefig("epa_scatter.png", dpi=300)
plt.show()
Packages: nflfastR nflplotR ggplot2 tidyverse matplotlib pandas
Weekly Performance Line Chart
Track team EPA performance across the season with a line chart.
Beginner
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Get weekly EPA for specific teams
teams_of_interest <- c("KC", "SF", "BAL", "DET")

weekly_epa <- pbp %>%
  filter(!is.na(epa), posteam %in% teams_of_interest) %>%
  group_by(posteam, week) %>%
  summarize(epa_per_play = mean(epa), .groups = "drop")

# Create line chart. `linewidth` replaces the deprecated `size`
# aesthetic for lines (ggplot2 >= 3.4.0); points still use `size`.
ggplot(weekly_epa, aes(x = week, y = epa_per_play,
                        color = posteam, group = posteam)) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
  scale_color_manual(values = c("KC" = "#E31837", "SF" = "#AA0000",
                                "BAL" = "#241773", "DET" = "#0076B6")) +
  labs(
    title = "Weekly EPA/Play Trends (2023)",
    x = "Week",
    y = "EPA/Play",
    color = "Team"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

ggsave("weekly_epa.png", width = 10, height = 6)
import nfl_data_py as nfl
import pandas as pd
import matplotlib.pyplot as plt

pbp = nfl.import_pbp_data([2023])

# Get weekly EPA for specific teams
teams = ["KC", "SF", "BAL", "DET"]
# Official team hex colors for the chart
team_colors = {"KC": "#E31837", "SF": "#AA0000",
               "BAL": "#241773", "DET": "#0076B6"}

plays = pbp[(pbp["epa"].notna()) & (pbp["posteam"].isin(teams))]
weekly_epa = plays.groupby(["posteam", "week"])["epa"].mean().reset_index()

# Create line chart: one line per team
fig, ax = plt.subplots(figsize=(12, 6))

for team in teams:
    team_data = weekly_epa[weekly_epa["posteam"] == team]
    ax.plot(team_data["week"], team_data["epa"],
            color=team_colors[team], label=team,
            linewidth=2, marker="o", markersize=5)

# Zero line separates above- from below-average efficiency
ax.axhline(y=0, color="gray", linestyle="--", alpha=0.5)
ax.set_xlabel("Week", fontsize=12)
ax.set_ylabel("EPA/Play", fontsize=12)
ax.set_title("Weekly EPA/Play Trends (2023)", fontsize=14, fontweight="bold")
ax.legend(loc="lower right")

plt.tight_layout()
plt.savefig("weekly_epa.png", dpi=300)
plt.show()
Packages: nflfastR ggplot2 tidyverse matplotlib pandas
Pass Chart Visualization
Create a field visualization showing pass locations and outcomes.
Advanced
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Get pass data for a specific QB.
# Map the categorical pass_location to a nominal x coordinate and use
# air_yards as the depth (y) coordinate.
qb_passes <- pbp %>%
  filter(
    passer_player_name == "P.Mahomes",
    !is.na(air_yards),
    !is.na(pass_location)
  ) %>%
  mutate(
    x = case_when(
      pass_location == "left" ~ -10,
      pass_location == "middle" ~ 0,
      pass_location == "right" ~ 10
    ),
    y = air_yards,
    result = case_when(
      interception == 1 ~ "INT",
      complete_pass == 1 ~ "Complete",
      TRUE ~ "Incomplete"
    )
  )

# Create pass chart. `linewidth` replaces the deprecated `size`
# aesthetic for line geoms (ggplot2 >= 3.4.0); geom_point keeps `size`.
ggplot(qb_passes, aes(x = x, y = y, color = result)) +
  geom_point(alpha = 0.6, size = 2) +
  geom_hline(yintercept = 0, color = "darkgreen", linewidth = 2) +
  scale_color_manual(values = c("Complete" = "#00AA00",
                                "Incomplete" = "#AA0000",
                                "INT" = "#000000")) +
  coord_cartesian(xlim = c(-20, 20), ylim = c(-5, 40)) +
  labs(
    title = "Patrick Mahomes Pass Chart (2023)",
    x = "Field Width",
    y = "Air Yards",
    color = "Result"
  ) +
  theme_minimal() +
  theme(
    panel.grid.minor = element_blank(),
    legend.position = "right"
  )

ggsave("pass_chart.png", width = 8, height = 10)
import nfl_data_py as nfl
import pandas as pd
import matplotlib.pyplot as plt

pbp = nfl.import_pbp_data([2023])

# Get pass data for a specific QB
qb_passes = pbp[
    (pbp["passer_player_name"] == "P.Mahomes") &
    (pbp["air_yards"].notna()) &
    (pbp["pass_location"].notna())
].copy()

# Map pass location to x coordinate (nominal field-width positions)
location_map = {"left": -10, "middle": 0, "right": 10}
qb_passes["x"] = qb_passes["pass_location"].map(location_map)
qb_passes["y"] = qb_passes["air_yards"]

# Determine result: INT takes precedence over completion status
def get_result(row):
    if row["interception"] == 1: return "INT"
    elif row["complete_pass"] == 1: return "Complete"
    else: return "Incomplete"

qb_passes["result"] = qb_passes.apply(get_result, axis=1)

# Create pass chart: one scatter layer per outcome so each gets
# its own color and legend entry
fig, ax = plt.subplots(figsize=(8, 10))

colors = {"Complete": "#00AA00", "Incomplete": "#AA0000", "INT": "#000000"}
for result, color in colors.items():
    data = qb_passes[qb_passes["result"] == result]
    ax.scatter(data["x"], data["y"], c=color, label=result,
               alpha=0.6, s=30)

# Line of scrimmage at air_yards == 0
ax.axhline(y=0, color="darkgreen", linewidth=3)
ax.set_xlim(-20, 20)
ax.set_ylim(-5, 40)
ax.set_xlabel("Field Width")
ax.set_ylabel("Air Yards")
ax.set_title("Patrick Mahomes Pass Chart (2023)", fontweight="bold")
ax.legend()

plt.tight_layout()
plt.savefig("pass_chart.png", dpi=300)
plt.show()
Packages: nflfastR ggplot2 tidyverse matplotlib pandas
Heatmap Visualization
Create heatmaps for play type efficiency by down and distance.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Create EPA heatmap by down and distance bucket
heatmap_data <- pbp %>%
  filter(!is.na(epa), down <= 4, !is.na(ydstogo)) %>%
  mutate(
    distance_bucket = case_when(
      ydstogo <= 3 ~ "1-3",
      ydstogo <= 6 ~ "4-6",
      ydstogo <= 10 ~ "7-10",
      TRUE ~ "10+"
    )
  ) %>%
  group_by(down, distance_bucket) %>%
  summarize(
    avg_epa = mean(epa),
    plays = n(),
    .groups = "drop"
  )

# Create heatmap. `linewidth` replaces the deprecated `size` aesthetic
# for tile borders (ggplot2 >= 3.4.0). The diverging fill is centered
# at 0 so red = negative EPA, green = positive.
ggplot(heatmap_data, aes(x = distance_bucket, y = factor(down), fill = avg_epa)) +
  geom_tile(color = "white", linewidth = 0.5) +
  geom_text(aes(label = round(avg_epa, 3)), color = "white", size = 4) +
  scale_fill_gradient2(low = "#BB0000", mid = "gray90", high = "#006400",
                        midpoint = 0, name = "EPA/Play") +
  labs(
    title = "EPA by Down and Distance (2023)",
    x = "Yards to Go",
    y = "Down"
  ) +
  theme_minimal() +
  theme(panel.grid = element_blank())

ggsave("epa_heatmap.png", width = 10, height = 6)
import nfl_data_py as nfl
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

pbp = nfl.import_pbp_data([2023])

# Filter valid plays.
# .copy() is required: the "distance_bucket" assignment below would
# otherwise target a view of pbp and raise SettingWithCopyWarning.
plays = pbp[(pbp["epa"].notna()) & (pbp["down"] <= 4) & (pbp["ydstogo"].notna())].copy()

# Create distance buckets
def distance_bucket(yd):
    if yd <= 3: return "1-3"
    elif yd <= 6: return "4-6"
    elif yd <= 10: return "7-10"
    else: return "10+"

plays["distance_bucket"] = plays["ydstogo"].apply(distance_bucket)

# Calculate EPA by down and distance, reshaped to downs x buckets
heatmap_data = plays.groupby(["down", "distance_bucket"])["epa"].mean().reset_index()
pivot_data = heatmap_data.pivot(index="down", columns="distance_bucket", values="epa")

# Reorder columns so distances read left-to-right, short to long
col_order = ["1-3", "4-6", "7-10", "10+"]
pivot_data = pivot_data[[c for c in col_order if c in pivot_data.columns]]

# Create heatmap (diverging palette centered at EPA = 0)
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(pivot_data, annot=True, fmt=".3f", cmap="RdYlGn", center=0,
            linewidths=0.5, ax=ax, cbar_kws={"label": "EPA/Play"})

ax.set_xlabel("Yards to Go")
ax.set_ylabel("Down")
ax.set_title("EPA by Down and Distance (2023)", fontweight="bold")

plt.tight_layout()
plt.savefig("epa_heatmap.png", dpi=300)
plt.show()
Packages: nflfastR ggplot2 tidyverse seaborn matplotlib pandas
Radar/Spider Chart
Create radar charts comparing QB performance across metrics.
Advanced
library(nflfastR)
library(tidyverse)
library(fmsb)  # For radar charts

pbp <- load_pbp(2023)

# Calculate QB metrics
# cpoe = completion percentage over expected (already in pbp data)
qb_metrics <- pbp %>%
  filter(!is.na(passer_player_id), play_type == "pass") %>%
  group_by(passer_player_name) %>%
  summarize(
    attempts = n(),
    epa_play = mean(epa, na.rm = TRUE),
    cpoe = mean(cpoe, na.rm = TRUE),
    avg_air = mean(air_yards, na.rm = TRUE),
    deep_pct = mean(air_yards >= 20, na.rm = TRUE) * 100,
    sack_rate = mean(sack == 1, na.rm = TRUE) * 100,
    .groups = "drop"
  ) %>%
  filter(attempts >= 300)

# Normalize to 0-100 scale for radar
# Min-max scaling across the qualifying QBs.
normalize <- function(x) (x - min(x)) / (max(x) - min(x)) * 100

qb_radar <- qb_metrics %>%
  mutate(across(c(epa_play, cpoe, avg_air, deep_pct), normalize)) %>%
  mutate(sack_rate = 100 - normalize(sack_rate))  # Invert - lower is better

# Get top QBs for comparison
top_qbs <- c("P.Mahomes", "J.Allen", "L.Jackson")
radar_data <- qb_radar %>%
  filter(passer_player_name %in% top_qbs) %>%
  select(passer_player_name, epa_play, cpoe, avg_air, deep_pct, sack_rate)

print(radar_data)
import nfl_data_py as nfl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import pi

pbp = nfl.import_pbp_data([2023])

# Calculate QB metrics (cpoe = completion percentage over expected)
passes = pbp[(pbp["passer_player_id"].notna()) & (pbp["play_type"] == "pass")]

qb_metrics = (passes.groupby("passer_player_name")
    .agg(
        attempts=("play_id", "count"),
        epa_play=("epa", "mean"),
        cpoe=("cpoe", "mean"),
        avg_air=("air_yards", "mean"),
        deep_pct=("air_yards", lambda x: (x >= 20).mean() * 100),
        sack_rate=("sack", "mean")
    )
    .reset_index())

# .copy() is required: the "_norm" column assignments below would
# otherwise target a view and raise SettingWithCopyWarning.
qb_metrics = qb_metrics[qb_metrics["attempts"] >= 300].copy()

# Normalize to 0-100 (min-max scaling across the qualifying QBs)
def normalize(col):
    return (col - col.min()) / (col.max() - col.min()) * 100

for col in ["epa_play", "cpoe", "avg_air", "deep_pct"]:
    qb_metrics[col + "_norm"] = normalize(qb_metrics[col])

# Invert sack rate (lower is better)
qb_metrics["sack_norm"] = 100 - normalize(qb_metrics["sack_rate"] * 100)

# Create radar chart for top QBs
categories = ["EPA/Play", "CPOE", "Air Yards", "Deep%", "Sack Avoidance"]
qbs = ["P.Mahomes", "J.Allen", "L.Jackson"]

fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
# One spoke per category; repeat the first angle to close the polygon
angles = [n / float(len(categories)) * 2 * pi for n in range(len(categories))]
angles += angles[:1]

for qb in qbs:
    data = qb_metrics[qb_metrics["passer_player_name"] == qb]
    if not data.empty:
        values = [data["epa_play_norm"].iloc[0], data["cpoe_norm"].iloc[0],
                  data["avg_air_norm"].iloc[0], data["deep_pct_norm"].iloc[0],
                  data["sack_norm"].iloc[0]]
        values += values[:1]
        ax.plot(angles, values, linewidth=2, label=qb)
        ax.fill(angles, values, alpha=0.1)

ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories)
ax.set_title("QB Comparison Radar Chart", fontweight="bold")
ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.0))
plt.tight_layout()
plt.savefig("qb_radar.png", dpi=300)
plt.show()
Packages: nflfastR tidyverse fmsb matplotlib pandas numpy
Box Plot Comparisons
Compare distributions using box plots for player or team analysis.
Beginner
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# EPA distribution by play type
play_epa <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run"))

# Create box plot
# outlier.shape = NA hides extreme outliers; coord_cartesian zooms
# without dropping data from the quartile computation.
ggplot(play_epa, aes(x = play_type, y = epa, fill = play_type)) +
  geom_boxplot(alpha = 0.7, outlier.shape = NA) +
  coord_cartesian(ylim = c(-3, 3)) +
  scale_fill_manual(values = c("pass" = "#1E90FF", "run" = "#228B22")) +
  labs(
    title = "EPA Distribution by Play Type (2023)",
    x = "Play Type",
    y = "EPA",
    fill = "Type"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

ggsave("epa_boxplot.png", width = 8, height = 6)

# Box plot by down
ggplot(play_epa %>% filter(!is.na(down), down <= 4),
       aes(x = factor(down), y = epa, fill = factor(down))) +
  geom_boxplot(alpha = 0.7, outlier.shape = NA) +
  coord_cartesian(ylim = c(-2, 2)) +
  labs(title = "EPA by Down", x = "Down", y = "EPA") +
  theme_minimal() +
  theme(legend.position = "none")

ggsave("epa_by_down.png", width = 8, height = 6)
import nfl_data_py as nfl
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pbp = nfl.import_pbp_data([2023])

# Filter plays with EPA
plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"].isin(["pass", "run"]))]

# Create box plot by play type
# showfliers=False hides extreme outliers so the boxes stay readable
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Plot 1: By play type
sns.boxplot(data=plays, x="play_type", y="epa", ax=axes[0],
            palette={"pass": "#1E90FF", "run": "#228B22"},
            showfliers=False)
axes[0].set_ylim(-3, 3)
axes[0].set_xlabel("Play Type")
axes[0].set_ylabel("EPA")
axes[0].set_title("EPA Distribution by Play Type (2023)")

# Plot 2: By down
plays_by_down = plays[(plays["down"].notna()) & (plays["down"] <= 4)]
sns.boxplot(data=plays_by_down, x="down", y="epa", ax=axes[1],
            palette="viridis", showfliers=False)
axes[1].set_ylim(-2, 2)
axes[1].set_xlabel("Down")
axes[1].set_ylabel("EPA")
axes[1].set_title("EPA Distribution by Down (2023)")

plt.tight_layout()
plt.savefig("epa_boxplots.png", dpi=300)
plt.show()
Packages: nflfastR ggplot2 tidyverse seaborn matplotlib pandas
Interactive Plotly Dashboard
Create interactive visualizations with hover tooltips.
Advanced
# Interactive team-efficiency scatter (offense vs defense EPA) with plotly.
library(nflfastR)
library(tidyverse)
library(plotly)

pbp <- load_pbp(2023)

# Calculate team stats
# Offensive side: group by the team with possession (posteam); then join each
# team's defensive EPA (same plays, grouped by defteam) onto the same row.
team_stats <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
  group_by(posteam) %>%
  summarize(
    off_epa = mean(epa),
    pass_rate = mean(play_type == "pass") * 100,
    plays = n(),
    .groups = "drop"
  ) %>%
  left_join(
    pbp %>%
      filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
      group_by(defteam) %>%
      summarize(def_epa = mean(epa)),
    by = c("posteam" = "defteam")
  )

# Create interactive scatter plot
# The text aesthetic plus hoverinfo = "text" gives custom hover tooltips.
p <- plot_ly(team_stats,
  x = ~off_epa,
  y = ~def_epa,
  text = ~paste("Team:", posteam,
                "<br>Off EPA:", round(off_epa, 3),
                "<br>Def EPA:", round(def_epa, 3),
                "<br>Pass Rate:", round(pass_rate, 1), "%"),
  hoverinfo = "text",
  type = "scatter",
  mode = "markers",
  marker = list(size = 12, opacity = 0.8)
) %>%
  layout(
    title = "NFL Team Efficiency (2023)",
    xaxis = list(title = "Offensive EPA/Play"),
    # Reversed y-axis: lower defensive EPA allowed is better, so good
    # defenses plot toward the top.
    yaxis = list(title = "Defensive EPA/Play", autorange = "reversed")
  )

# Save or display
htmlwidgets::saveWidget(p, "team_efficiency.html")
# Interactive team-efficiency scatter (offense vs defense EPA) with plotly.
import nfl_data_py as nfl
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

pbp = nfl.import_pbp_data([2023])

# Calculate team stats
plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"].isin(["pass", "run"]))]

# Offensive stats grouped by the possessing team.
off_stats = (plays.groupby("posteam")
    .agg(
        off_epa=("epa", "mean"),
        pass_rate=("play_type", lambda x: (x == "pass").mean() * 100),
        plays=("epa", "count")
    )
    .reset_index())

# Defensive stats grouped by the defending team.
def_stats = (plays.groupby("defteam")
    .agg(def_epa=("epa", "mean"))
    .reset_index())

# Join each team's offense and defense onto one row (posteam == defteam).
team_stats = off_stats.merge(def_stats, left_on="posteam", right_on="defteam")

# Create interactive scatter with Plotly
fig = px.scatter(
    team_stats,
    x="off_epa",
    y="def_epa",
    text="posteam",
    hover_data={"off_epa": ":.3f", "def_epa": ":.3f", "pass_rate": ":.1f"},
    title="NFL Team Efficiency (2023)"
)

fig.update_traces(textposition="top center", marker=dict(size=12))
fig.update_yaxes(autorange="reversed")  # Lower def EPA is better
fig.update_layout(
    xaxis_title="Offensive EPA/Play (higher is better)",
    yaxis_title="Defensive EPA/Play (lower is better)"
)

fig.write_html("team_efficiency.html")
fig.show()
Packages: nflfastR tidyverse plotly pandas
Histogram Distributions
Visualize distributions of key metrics with histograms.
Beginner
# Histograms of EPA (by play type) and air yards for the 2023 season.
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# EPA histogram
# Keep only pass/run plays that carry an EPA value.
epa_data <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run"))

# position = "identity" overlays the two distributions instead of stacking;
# coord_cartesian() zooms without dropping rows outside the window.
ggplot(epa_data, aes(x = epa, fill = play_type)) +
  geom_histogram(bins = 50, alpha = 0.6, position = "identity") +
  scale_fill_manual(values = c("pass" = "#1E90FF", "run" = "#228B22")) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "black") +
  coord_cartesian(xlim = c(-5, 5)) +
  labs(
    title = "Distribution of EPA by Play Type (2023)",
    x = "Expected Points Added (EPA)",
    y = "Count",
    fill = "Play Type"
  ) +
  theme_minimal()

ggsave("epa_histogram.png", width = 10, height = 6)

# Air yards histogram
air_data <- pbp %>%
  filter(!is.na(air_yards), play_type == "pass")

# linewidth replaces the deprecated `size` aesthetic for lines (ggplot2 >= 3.4).
ggplot(air_data, aes(x = air_yards)) +
  geom_histogram(bins = 40, fill = "#1E90FF", alpha = 0.7) +
  geom_vline(aes(xintercept = mean(air_yards)), color = "red",
             linetype = "dashed", linewidth = 1) +
  labs(
    title = "Distribution of Air Yards (2023)",
    x = "Air Yards",
    y = "Count",
    caption = "Red line = mean"
  ) +
  theme_minimal()

ggsave("air_yards_histogram.png", width = 10, height = 6)
# Histograms of EPA (by play type) and air yards for the 2023 season.
import nfl_data_py as nfl
import pandas as pd
import matplotlib.pyplot as plt

pbp = nfl.import_pbp_data([2023])

# EPA histogram by play type
plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"].isin(["pass", "run"]))]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: EPA distribution
# Overlay pass and run histograms with translucency instead of stacking.
for play_type, color in [("pass", "#1E90FF"), ("run", "#228B22")]:
    data = plays[plays["play_type"] == play_type]["epa"]
    axes[0].hist(data, bins=50, alpha=0.6, color=color, label=play_type.capitalize())

axes[0].axvline(x=0, color="black", linestyle="--")
axes[0].set_xlim(-5, 5)
axes[0].set_xlabel("EPA")
axes[0].set_ylabel("Count")
axes[0].set_title("Distribution of EPA by Play Type (2023)")
axes[0].legend()

# Plot 2: Air yards distribution
air_data = pbp[(pbp["air_yards"].notna()) & (pbp["play_type"] == "pass")]
axes[1].hist(air_data["air_yards"], bins=40, color="#1E90FF", alpha=0.7)
# Mark the mean depth of target with a labeled reference line.
mean_air = air_data["air_yards"].mean()
axes[1].axvline(x=mean_air, color="red", linestyle="--", label=f"Mean: {mean_air:.1f}")
axes[1].set_xlabel("Air Yards")
axes[1].set_ylabel("Count")
axes[1].set_title("Distribution of Air Yards (2023)")
axes[1].legend()

plt.tight_layout()
plt.savefig("histograms.png", dpi=300)
plt.show()
Packages: nflfastR ggplot2 tidyverse matplotlib pandas
Correlation Matrix
Visualize correlations between multiple metrics.
Intermediate
# Correlation matrix of team-level offensive metrics, rendered with corrplot.
library(nflfastR)
library(tidyverse)
library(corrplot)

pbp <- load_pbp(2023)

# Calculate team metrics for correlation
# One row per offense: overall/pass/run EPA, pass rate, success and
# explosive-play rates (EPA > 2 used as the explosive threshold here).
team_metrics <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
  group_by(posteam) %>%
  summarize(
    off_epa = mean(epa),
    pass_rate = mean(play_type == "pass") * 100,
    pass_epa = mean(epa[play_type == "pass"]),
    run_epa = mean(epa[play_type == "run"]),
    success_rate = mean(success, na.rm = TRUE) * 100,
    explosive_rate = mean(epa > 2, na.rm = TRUE) * 100,
    .groups = "drop"
  )

# Create correlation matrix
# Drop the team name column so cor() sees only numeric columns.
cor_matrix <- team_metrics %>%
  select(-posteam) %>%
  cor(use = "complete.obs")

# Visualize
# First render goes to the active graphics device (screen).
corrplot(cor_matrix,
         method = "color",
         type = "upper",
         addCoef.col = "black",
         number.cex = 0.7,
         tl.col = "black",
         title = "Team Offensive Metrics Correlation",
         mar = c(0, 0, 2, 0))

# Second render is drawn again into a PNG device, then the device is closed.
png("correlation_matrix.png", width = 800, height = 800)
corrplot(cor_matrix, method = "color", type = "upper",
         addCoef.col = "black", number.cex = 0.8)
dev.off()
# Correlation heatmap of team-level offensive metrics (setup + data load).
import nfl_data_py as nfl
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

pbp = nfl.import_pbp_data([2023])

# Calculate team metrics
# Keep only pass/run plays that carry an EPA value.
plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"].isin(["pass", "run"]))]

def calculate_metrics(group):
    """Summarize one team's plays into a row of offensive metrics.

    Expects columns ``epa``, ``play_type``, and ``success``. Returns a
    pd.Series so groupby(...).apply() yields one row per team.
    """
    epa = group["epa"]
    is_pass = group["play_type"] == "pass"
    metrics = {
        "off_epa": epa.mean(),
        "pass_rate": is_pass.mean() * 100,
        "pass_epa": epa[is_pass].mean(),
        "run_epa": epa[group["play_type"] == "run"].mean(),
        "success_rate": group["success"].mean() * 100,
        # Explosive play threshold: EPA greater than 2.
        "explosive_rate": (epa > 2).mean() * 100,
    }
    return pd.Series(metrics)

# One row per offense with the metrics computed by calculate_metrics().
team_metrics = plays.groupby("posteam").apply(calculate_metrics).reset_index()

# Calculate correlation matrix
# Drop the team name column so corr() sees only numeric columns.
metrics_only = team_metrics.drop("posteam", axis=1)
cor_matrix = metrics_only.corr()

# Create heatmap
fig, ax = plt.subplots(figsize=(10, 8))
# Mask the upper triangle (incl. diagonal) to show each pair only once.
mask = np.triu(np.ones_like(cor_matrix, dtype=bool))

sns.heatmap(cor_matrix, mask=mask, annot=True, fmt=".2f",
            cmap="coolwarm", center=0, square=True,
            linewidths=0.5, ax=ax)

ax.set_title("Team Offensive Metrics Correlation", fontweight="bold")
plt.tight_layout()
plt.savefig("correlation_matrix.png", dpi=300)
plt.show()
Packages: nflfastR tidyverse corrplot seaborn matplotlib pandas
Time Series with Annotations
Create time series plots with key events annotated.
Intermediate
# Rolling 50-play EPA trend for one team, with selected weeks annotated.
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Calculate rolling EPA for a team
# zoo::rollmean() requires the zoo package to be installed (called via ::).
team_rolling <- pbp %>%
  filter(posteam == "SF", !is.na(epa), play_type %in% c("pass", "run")) %>%
  arrange(game_id, play_id) %>%
  mutate(
    play_num = row_number(),
    rolling_epa = zoo::rollmean(epa, k = 50, fill = NA, align = "right")
  )

# Define key game events: last play number and rolling EPA of selected weeks.
key_games <- team_rolling %>%
  group_by(game_id, week) %>%
  summarize(
    play_num = max(play_num),
    rolling_epa = last(na.omit(rolling_epa)),
    .groups = "drop"
  ) %>%
  filter(week %in% c(1, 8, 18))  # Annotate select weeks

# Create time series with annotations
# linewidth replaces the deprecated `size` aesthetic for lines (ggplot2 >= 3.4);
# geom_text/geom_point still use `size` for text/point sizing.
ggplot(team_rolling, aes(x = play_num, y = rolling_epa)) +
  geom_line(color = "#AA0000", linewidth = 1) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
  geom_point(data = key_games, color = "black", size = 3) +
  geom_text(data = key_games, aes(label = paste("Week", week)),
            vjust = -1.5, size = 3) +
  labs(
    title = "San Francisco 49ers - Rolling EPA/Play (2023)",
    subtitle = "50-play rolling average",
    x = "Play Number",
    y = "Rolling EPA/Play"
  ) +
  theme_minimal()

ggsave("rolling_epa.png", width = 12, height = 6)
# Rolling 50-play EPA trend for the 49ers, with selected weeks annotated.
import nfl_data_py as nfl
import pandas as pd
import matplotlib.pyplot as plt

pbp = nfl.import_pbp_data([2023])

# Calculate rolling EPA for SF offensive snaps (pass/run with an EPA value).
sf_plays = pbp[(pbp["posteam"] == "SF") &
               (pbp["epa"].notna()) &
               (pbp["play_type"].isin(["pass", "run"]))].copy()

# Order chronologically before numbering plays and rolling.
sf_plays = sf_plays.sort_values(["game_id", "play_id"])
sf_plays["play_num"] = range(1, len(sf_plays) + 1)
sf_plays["rolling_epa"] = sf_plays["epa"].rolling(window=50).mean()

# Get key week markers: last play of each selected week plus its rolling EPA.
key_weeks = [1, 8, 18]
annotations = []
for week in key_weeks:
    week_data = sf_plays[sf_plays["week"] == week]
    if not week_data.empty:
        rolling = week_data["rolling_epa"].dropna()
        annotations.append({
            "week": week,
            "play_num": week_data["play_num"].max(),
            # Fall back to 0 when the window has not filled yet (first 49 plays).
            "rolling_epa": rolling.iloc[-1] if not rolling.empty else 0
        })

# Create plot
fig, ax = plt.subplots(figsize=(12, 6))

ax.plot(sf_plays["play_num"], sf_plays["rolling_epa"],
        color="#AA0000", linewidth=1.5, label="Rolling EPA")
ax.axhline(y=0, color="gray", linestyle="--", alpha=0.7)

# Add annotations
for ann in annotations:
    ax.scatter(ann["play_num"], ann["rolling_epa"], color="black", s=50, zorder=5)
    # Single quotes inside the f-string expression: backslash escapes there
    # are a SyntaxError on Python < 3.12.
    ax.annotate(f"Week {ann['week']}", (ann["play_num"], ann["rolling_epa"]),
                textcoords="offset points", xytext=(0, 10), ha="center")

ax.set_xlabel("Play Number")
ax.set_ylabel("Rolling EPA/Play (50-play average)")
ax.set_title("San Francisco 49ers - Rolling EPA/Play (2023)", fontweight="bold")

plt.tight_layout()
plt.savefig("rolling_epa.png", dpi=300)
plt.show()
Packages: nflfastR tidyverse zoo matplotlib pandas
Field Position Visualization
Create field diagrams showing drive starts and scoring.
Advanced
# Touchdown rate by starting field position, summarized per drive.
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Get drive starting positions and outcomes
# One row per (game, drive): the first yardline_100 is the starting spot,
# the last fixed_drive_result is the drive outcome.
drive_data <- pbp %>%
  filter(!is.na(drive), !is.na(yardline_100)) %>%
  group_by(game_id, drive) %>%
  summarize(
    start_yard = first(yardline_100),
    result = last(fixed_drive_result),
    posteam = first(posteam),
    .groups = "drop"
  ) %>%
  filter(!is.na(result))

# Summarize by field position
# yardline_100 counts distance to the opponent's end zone, so >= 80 means
# the drive started inside the offense's own 20.
field_summary <- drive_data %>%
  mutate(
    zone = case_when(
      start_yard >= 80 ~ "Own 0-20",
      start_yard >= 60 ~ "Own 20-40",
      start_yard >= 40 ~ "Midfield",
      start_yard >= 20 ~ "Opp 20-40",
      TRUE ~ "Red Zone"
    ),
    scored = result == "Touchdown"
  ) %>%
  group_by(zone) %>%
  summarize(
    drives = n(),
    tds = sum(scored),
    td_rate = mean(scored) * 100,
    .groups = "drop"
  )

# Create bar chart
# reorder() sorts bars by TD rate descending so the best zone comes first.
ggplot(field_summary, aes(x = reorder(zone, -td_rate), y = td_rate, fill = td_rate)) +
  geom_col() +
  geom_text(aes(label = paste0(round(td_rate, 1), "%")), vjust = -0.5) +
  scale_fill_gradient(low = "#FFC107", high = "#28A745") +
  labs(
    title = "Touchdown Rate by Starting Field Position",
    x = "Starting Field Position",
    y = "TD Rate (%)"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

ggsave("field_position.png", width = 10, height = 6)
# Touchdown rate by starting field position: load data and build one row
# per drive with its starting yardline and final result.
import nfl_data_py as nfl
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

pbp = nfl.import_pbp_data([2023])

# Get drive starting positions and outcomes
drives = pbp[(pbp["drive"].notna()) & (pbp["yardline_100"].notna())]

# One row per (game, drive): first yardline is the starting spot, last
# fixed_drive_result is the outcome.
drive_summary = (drives.groupby(["game_id", "drive"])
    .agg(
        start_yard=("yardline_100", "first"),
        result=("fixed_drive_result", "last")
    )
    .reset_index())

# .copy() so later column assignments work on an independent frame instead
# of a filtered view (avoids SettingWithCopyWarning).
drive_summary = drive_summary[drive_summary["result"].notna()].copy()
# Assign field zones
def get_zone(yard):
    """Map a drive's starting yardline_100 (distance to the opposing end
    zone) onto a coarse field-position zone label."""
    if yard >= 80:
        return "Own 0-20"
    if yard >= 60:
        return "Own 20-40"
    if yard >= 40:
        return "Midfield"
    if yard >= 20:
        return "Opp 20-40"
    return "Red Zone"

# Label each drive with its field-position zone and TD flag.
# NOTE(review): drive_summary was produced by a boolean filter above; pandas
# may emit SettingWithCopyWarning here unless that frame was .copy()'d.
drive_summary["zone"] = drive_summary["start_yard"].apply(get_zone)
drive_summary["scored"] = drive_summary["result"] == "Touchdown"

# Calculate TD rate by zone
zone_stats = (drive_summary.groupby("zone")
    .agg(
        drives=("scored", "count"),
        tds=("scored", "sum"),
        td_rate=("scored", "mean")
    )
    .reset_index())

zone_stats["td_rate_pct"] = zone_stats["td_rate"] * 100
zone_stats = zone_stats.sort_values("td_rate_pct", ascending=False)

# Create bar chart
fig, ax = plt.subplots(figsize=(10, 6))

# Color bars by TD rate (fraction in [0, 1]) on a red-yellow-green colormap.
colors = plt.cm.RdYlGn(zone_stats["td_rate"])
bars = ax.bar(zone_stats["zone"], zone_stats["td_rate_pct"], color=colors)

# Print the percentage just above each bar.
for bar, rate in zip(bars, zone_stats["td_rate_pct"]):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
            f"{rate:.1f}%", ha="center", va="bottom")

ax.set_xlabel("Starting Field Position")
ax.set_ylabel("TD Rate (%)")
ax.set_title("Touchdown Rate by Starting Field Position", fontweight="bold")

plt.tight_layout()
plt.savefig("field_position.png", dpi=300)
plt.show()
Packages: nflfastR ggplot2 tidyverse matplotlib pandas
Animated Season Progress
Create animated visualizations showing team rankings over time.
Advanced
# Animated "bar chart race" of cumulative EPA rankings across the season.
library(nflfastR)
library(tidyverse)
library(gganimate)
library(gifski)

pbp <- load_pbp(2023)

# Calculate cumulative EPA by week
# cumulative_epa is the running mean of weekly EPA values (cumsum over the
# count of weeks so far), not a play-weighted average.
weekly_cumulative <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
  group_by(posteam, week) %>%
  summarize(weekly_epa = mean(epa), .groups = "drop") %>%
  arrange(posteam, week) %>%
  group_by(posteam) %>%
  mutate(cumulative_epa = cumsum(weekly_epa) / row_number()) %>%
  ungroup()

# Add rankings
# rank(-x) gives rank 1 to the highest cumulative EPA each week.
weekly_ranked <- weekly_cumulative %>%
  group_by(week) %>%
  mutate(rank = rank(-cumulative_epa)) %>%
  ungroup()

# Create animated bar chart race
# transition_states() interpolates between weekly snapshots; the
# {closest_state} placeholder in the title shows the current week.
p <- ggplot(weekly_ranked,
            aes(x = rank, y = cumulative_epa, fill = posteam)) +
  geom_col(width = 0.8) +
  geom_text(aes(label = posteam), hjust = -0.1, size = 4) +
  coord_flip(clip = "off") +
  scale_x_reverse() +
  labs(
    title = "NFL EPA Rankings Through Week {closest_state}",
    x = "Rank",
    y = "Cumulative EPA/Play"
  ) +
  theme_minimal() +
  theme(legend.position = "none") +
  transition_states(week, transition_length = 4, state_length = 1)

# Render animation
# gifski renders the frames to a GIF; anim_save() writes the last animation.
animate(p, nframes = 200, fps = 20, width = 800, height = 600)
anim_save("epa_race.gif")
# Static snapshot of cumulative EPA rankings through the latest week.
import nfl_data_py as nfl
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
import matplotlib.animation as animation

pbp = nfl.import_pbp_data([2023])

# Calculate cumulative EPA by week
plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"].isin(["pass", "run"]))]

weekly_epa = (plays.groupby(["posteam", "week"])["epa"]
    .mean()
    .reset_index()
    .sort_values(["posteam", "week"]))

# Calculate cumulative average
# Running mean of the weekly EPA values per team. NOTE(review): the
# reset_index(drop=True) alignment relies on weekly_epa being sorted by
# (posteam, week) with a default RangeIndex — confirm before reusing.
weekly_epa["cumulative_epa"] = weekly_epa.groupby("posteam")["epa"].expanding().mean().reset_index(drop=True)

# Get rankings per week
weekly_epa["rank"] = weekly_epa.groupby("week")["cumulative_epa"].rank(ascending=False)

# Create static snapshot for a specific week (animation requires extra setup)
final_week = weekly_epa["week"].max()
final_data = weekly_epa[weekly_epa["week"] == final_week].sort_values("cumulative_epa", ascending=True)

fig, ax = plt.subplots(figsize=(10, 12))

# Scale cumulative EPA to [0, 1] to index the red-yellow-green colormap.
colors = plt.cm.RdYlGn((final_data["cumulative_epa"] - final_data["cumulative_epa"].min()) /
                        (final_data["cumulative_epa"].max() - final_data["cumulative_epa"].min()))

bars = ax.barh(final_data["posteam"], final_data["cumulative_epa"], color=colors)

ax.set_xlabel("Cumulative EPA/Play")
ax.set_title(f"NFL EPA Rankings Through Week {final_week}", fontweight="bold")
ax.axvline(x=0, color="gray", linestyle="--", alpha=0.5)

# Print the value next to each bar.
for bar, epa in zip(bars, final_data["cumulative_epa"]):
    ax.text(epa + 0.005, bar.get_y() + bar.get_height()/2,
            f"{epa:.3f}", va="center", fontsize=8)

plt.tight_layout()
plt.savefig("epa_rankings.png", dpi=300)
plt.show()

print("Note: For true animation, use matplotlib FuncAnimation or plotly")
Packages: nflfastR tidyverse gganimate gifski matplotlib pandas

Passing Analysis

Deep dive into quarterback and passing game analytics

Air Yards Analysis
Analyze depth of target and air yards distribution.
Intermediate
# Depth-of-target profile for qualifying quarterbacks (2023 season).
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# QB air yards analysis: one row per passer, 200-attempt minimum.
targeted_throws <- pbp %>%
  filter(!is.na(passer_player_id), !is.na(air_yards))

qb_air_yards <- targeted_throws %>%
  group_by(passer_player_id, passer_player_name) %>%
  summarize(
    attempts = n(),
    total_air_yards = sum(air_yards),
    avg_air_yards = mean(air_yards),
    deep_pct = mean(air_yards >= 20) * 100,
    short_pct = mean(air_yards < 5) * 100,
    .groups = "drop"
  ) %>%
  filter(attempts >= 200) %>%
  arrange(desc(avg_air_yards))

print(qb_air_yards)

# Average depth of target on completions vs incompletions.
pbp %>%
  filter(!is.na(air_yards)) %>%
  mutate(result = if_else(complete_pass == 1, "Complete", "Incomplete")) %>%
  group_by(result) %>%
  summarize(avg_air_yards = mean(air_yards))
# Depth-of-target profile for qualifying quarterbacks (2023 season).
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# QB air yards analysis
# Keep only throws with a charted passer and a recorded air-yards value.
passes = pbp[(pbp["passer_player_id"].notna()) & (pbp["air_yards"].notna())]

qb_air_yards = (passes.groupby(["passer_player_id", "passer_player_name"])
    .agg(
        attempts=("air_yards", "count"),
        total_air_yards=("air_yards", "sum"),
        avg_air_yards=("air_yards", "mean"),
        # Share of throws targeted 20+ yards downfield.
        deep_pct=("air_yards", lambda x: (x >= 20).mean() * 100)
    )
    .reset_index())

# 200-attempt minimum; deepest average target first.
qb_air_yards = qb_air_yards[qb_air_yards["attempts"] >= 200].sort_values(
    "avg_air_yards", ascending=False)

print("QB Air Yards Analysis:")
print(qb_air_yards)
Packages: nflfastR tidyverse nfl_data_py pandas
Completion Percentage Over Expected (CPOE)
Analyze quarterback accuracy using CPOE metric.
Intermediate
# Quarterback accuracy via CPOE (completion percentage over expected).
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# CPOE analysis
# cp is nflfastR's modeled completion probability; cpoe is actual minus
# expected, so positive values mean more accurate than the model expects.
qb_cpoe <- pbp %>%
  filter(!is.na(passer_player_id), !is.na(cpoe)) %>%
  group_by(passer_player_id, passer_player_name) %>%
  summarize(
    attempts = n(),
    completion_pct = mean(complete_pass) * 100,
    exp_completion_pct = mean(cp) * 100,
    cpoe = mean(cpoe),
    .groups = "drop"
  ) %>%
  filter(attempts >= 200) %>%
  arrange(desc(cpoe))

print(qb_cpoe)

# CPOE by depth of target
# case_when() evaluates top-down, so each band implicitly excludes earlier ones.
pbp %>%
  filter(!is.na(cpoe), !is.na(air_yards)) %>%
  mutate(
    depth = case_when(
      air_yards < 0 ~ "Behind LOS",
      air_yards < 10 ~ "Short (0-9)",
      air_yards < 20 ~ "Medium (10-19)",
      TRUE ~ "Deep (20+)"
    )
  ) %>%
  group_by(depth) %>%
  summarize(
    attempts = n(),
    avg_cpoe = mean(cpoe),
    completion_rate = mean(complete_pass)
  )
# Quarterback accuracy via CPOE (completion percentage over expected).
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# CPOE analysis
# cpoe = actual completion minus modeled completion probability, so
# positive averages mean more accurate than expected.
passes = pbp[(pbp["passer_player_id"].notna()) & (pbp["cpoe"].notna())]

qb_cpoe = (passes.groupby(["passer_player_id", "passer_player_name"])
    .agg(
        attempts=("cpoe", "count"),
        completion_pct=("complete_pass", lambda x: x.mean() * 100),
        cpoe=("cpoe", "mean")
    )
    .reset_index())

# 200-attempt minimum; most accurate first.
qb_cpoe = qb_cpoe[qb_cpoe["attempts"] >= 200].sort_values("cpoe", ascending=False)

print("QB CPOE Rankings:")
print(qb_cpoe)
Packages: nflfastR tidyverse nfl_data_py pandas
Depth of Target Analysis
Break down passing efficiency by depth zones.
Intermediate
# Passing efficiency by depth-of-target zone, plus team deep-ball rankings.
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Define depth zones
# case_when() evaluates top-down, so each band implicitly excludes earlier ones.
depth_analysis <- pbp %>%
  filter(!is.na(air_yards), play_type == "pass") %>%
  mutate(
    depth_zone = case_when(
      air_yards < 0 ~ "Behind LOS",
      air_yards < 5 ~ "Short (0-4)",
      air_yards < 10 ~ "Intermediate (5-9)",
      air_yards < 15 ~ "Medium (10-14)",
      air_yards < 20 ~ "Deep (15-19)",
      TRUE ~ "Bomb (20+)"
    )
  ) %>%
  group_by(depth_zone) %>%
  summarize(
    attempts = n(),
    completions = sum(complete_pass),
    comp_pct = mean(complete_pass) * 100,
    avg_epa = mean(epa, na.rm = TRUE),
    td_rate = mean(pass_touchdown, na.rm = TRUE) * 100,
    int_rate = mean(interception) * 100,
    .groups = "drop"
  )

print(depth_analysis)

# Team deep passing rankings
# Deep = targeted 20+ yards downfield; ranked by EPA per deep attempt.
pbp %>%
  filter(air_yards >= 20, !is.na(epa)) %>%
  group_by(posteam) %>%
  summarize(
    deep_attempts = n(),
    deep_comp_pct = mean(complete_pass) * 100,
    deep_epa = mean(epa)
  ) %>%
  arrange(desc(deep_epa))
# Passing efficiency by depth-of-target zone: load data and select passes.
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Filter pass plays with air yards; .copy() so the depth_zone column added
# later lands on an independent frame (avoids SettingWithCopyWarning).
passes = pbp[(pbp["air_yards"].notna()) & (pbp["play_type"] == "pass")].copy()

# Depth zones
def get_depth_zone(ay):
    """Bucket an air-yards value into a depth-of-target zone label."""
    if ay < 0:
        return "Behind LOS"
    if ay < 5:
        return "Short (0-4)"
    if ay < 10:
        return "Intermediate (5-9)"
    if ay < 15:
        return "Medium (10-14)"
    if ay < 20:
        return "Deep (15-19)"
    return "Bomb (20+)"

# Label each attempt with its depth zone.
# NOTE(review): passes was produced by a boolean filter above; pandas may
# emit SettingWithCopyWarning here unless that frame was .copy()'d.
passes["depth_zone"] = passes["air_yards"].apply(get_depth_zone)

# Aggregate efficiency per depth zone.
depth_analysis = (passes.groupby("depth_zone")
    .agg(
        attempts=("air_yards", "count"),
        comp_pct=("complete_pass", lambda x: x.mean() * 100),
        avg_epa=("epa", "mean"),
        td_rate=("pass_touchdown", lambda x: x.mean() * 100)
    )
    .reset_index())

print("Passing Efficiency by Depth:")
print(depth_analysis)
Packages: nflfastR tidyverse nfl_data_py pandas
Pressure and Sack Rate
Analyze how quarterbacks perform under pressure.
Advanced
# Sack rates per QB and the EPA cost of sacks league-wide.
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# QB performance under pressure
# epa_no_sack isolates efficiency on dropbacks where the QB got the ball out.
qb_pressure <- pbp %>%
  filter(!is.na(passer_player_id), play_type == "pass") %>%
  group_by(passer_player_id, passer_player_name) %>%
  summarize(
    dropbacks = n(),
    sacks = sum(sack),
    sack_rate = mean(sack) * 100,
    epa_no_sack = mean(epa[sack == 0], na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(dropbacks >= 200) %>%
  arrange(sack_rate)

print(qb_pressure)

# EPA with vs without pressure (using sack as proxy)
# Grouping on a logical splits plays into sacked (TRUE) vs not (FALSE).
pbp %>%
  filter(play_type == "pass", !is.na(epa)) %>%
  group_by(sack = as.logical(sack)) %>%
  summarize(
    plays = n(),
    avg_epa = mean(epa)
  )
# Sack rates per qualifying QB (lower is better).
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# QB pressure analysis
passes = pbp[(pbp["passer_player_id"].notna()) & (pbp["play_type"] == "pass")]

qb_pressure = (passes.groupby(["passer_player_id", "passer_player_name"])
    .agg(
        # count of the sack column == number of pass plays (dropbacks here).
        dropbacks=("sack", "count"),
        sacks=("sack", "sum"),
        sack_rate=("sack", lambda x: x.mean() * 100)
    )
    .reset_index())

# 200-dropback minimum; ascending sort puts the least-sacked QBs first.
qb_pressure = qb_pressure[qb_pressure["dropbacks"] >= 200].sort_values("sack_rate")

print("QB Sack Rates (lowest is best):")
print(qb_pressure)
Packages: nflfastR tidyverse nfl_data_py pandas
Pass Location Heatmaps
Visualize pass distribution by field location.
Advanced
# Team passing tendencies by field location (left / middle / right).
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Create pass location data
pass_locations <- pbp %>%
  filter(
    play_type == "pass",
    !is.na(air_yards),
    !is.na(pass_location)
  ) %>%
  mutate(
    # Numeric x coordinate for the three charted locations.
    # NOTE(review): x_loc is not used by the summaries or plot below.
    x_loc = case_when(
      pass_location == "left" ~ -1,
      pass_location == "middle" ~ 0,
      pass_location == "right" ~ 1
    )
  )

# Team passing tendencies
# pct is each location's share of the team's charted attempts.
team_tendencies <- pass_locations %>%
  group_by(posteam, pass_location) %>%
  summarize(
    attempts = n(),
    epa = mean(epa, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  group_by(posteam) %>%
  mutate(pct = attempts / sum(attempts) * 100)

# Visualize (sample for one team)
# Diverging fill: red below EPA 0, green above.
team_tendencies %>%
  filter(posteam == "KC") %>%
  ggplot(aes(x = pass_location, y = pct, fill = epa)) +
  geom_col() +
  scale_fill_gradient2(low = "red", mid = "white", high = "green", midpoint = 0) +
  labs(title = "KC Pass Location Tendencies", y = "% of Passes") +
  theme_minimal()
# Team passing tendencies by field location (left / middle / right).
import nfl_data_py as nfl
import pandas as pd
import matplotlib.pyplot as plt

pbp = nfl.import_pbp_data([2023])

# Pass location analysis
passes = pbp[
    (pbp["play_type"] == "pass") &
    (pbp["air_yards"].notna()) &
    (pbp["pass_location"].notna())
]

# Team passing tendencies
team_tendencies = (passes.groupby(["posteam", "pass_location"])
    .agg(
        attempts=("epa", "count"),
        epa=("epa", "mean")
    )
    .reset_index())

# Calculate percentages
# transform("sum") broadcasts each team's total back onto its rows.
team_totals = team_tendencies.groupby("posteam")["attempts"].transform("sum")
team_tendencies["pct"] = team_tendencies["attempts"] / team_totals * 100

print("Team Pass Location Tendencies:")
print(team_tendencies[team_tendencies["posteam"] == "KC"])
Packages: nflfastR tidyverse ggplot2 nfl_data_py pandas matplotlib
Dropback Success Rate
Calculate QB success rate on all dropback plays.
Intermediate
# QB success rate and efficiency across all dropbacks (passes, sacks,
# scrambles), 200-dropback minimum.
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Dropback success rate (includes sacks)
dropback_plays <- pbp %>%
  filter(!is.na(passer_player_id), qb_dropback == 1)

qb_dropback <- dropback_plays %>%
  group_by(passer_player_id, passer_player_name) %>%
  summarize(
    dropbacks = n(),
    success_rate = mean(success, na.rm = TRUE) * 100,
    epa_per_dropback = mean(epa, na.rm = TRUE),
    sack_rate = mean(sack) * 100,
    scramble_rate = mean(qb_scramble, na.rm = TRUE) * 100,
    .groups = "drop"
  ) %>%
  filter(dropbacks >= 200) %>%
  arrange(desc(success_rate))

print(qb_dropback)
# QB success rate and efficiency across all dropbacks, 200-dropback minimum.
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Dropback success rate
dropbacks = pbp[(pbp["passer_player_id"].notna()) & (pbp["qb_dropback"] == 1)]

qb_dropback = (dropbacks.groupby(["passer_player_id", "passer_player_name"])
    .agg(
        # count of the success column == number of dropbacks with a value.
        dropbacks=("success", "count"),
        success_rate=("success", lambda x: x.mean() * 100),
        epa_per_dropback=("epa", "mean"),
        sack_rate=("sack", lambda x: x.mean() * 100)
    )
    .reset_index())

qb_dropback = qb_dropback[qb_dropback["dropbacks"] >= 200].sort_values(
    "success_rate", ascending=False)

print("QB Dropback Success Rate:")
print(qb_dropback)
Packages: nflfastR tidyverse nfl_data_py pandas
Play Action Effectiveness
Compare play action vs standard passing efficiency.
Intermediate
# Compare play-action vs standard dropbacks, league-wide and per team.
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Play action analysis
# Replace missing play_action values with FALSE so later logical subsetting
# has no NAs. NOTE(review): this assumes the play_action column is logical —
# confirm against the loaded data's schema before reusing.
play_action <- pbp %>%
  filter(play_type == "pass", !is.na(epa)) %>%
  mutate(play_action = if_else(is.na(play_action), FALSE, play_action))

# Overall comparison
play_action %>%
  group_by(play_action) %>%
  summarize(
    attempts = n(),
    avg_epa = mean(epa),
    success_rate = mean(success) * 100,
    avg_air_yards = mean(air_yards, na.rm = TRUE)
  )

# Team play action usage and effectiveness
# pa_advantage > 0 means the team gains EPA when using play action.
team_pa <- play_action %>%
  group_by(posteam) %>%
  summarize(
    total_passes = n(),
    pa_rate = mean(play_action) * 100,
    pa_epa = mean(epa[play_action]),
    no_pa_epa = mean(epa[!play_action]),
    pa_advantage = mean(epa[play_action]) - mean(epa[!play_action])
  ) %>%
  arrange(desc(pa_advantage))

print(team_pa)
# Compare play-action vs standard dropbacks, league-wide and per team.
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Play action analysis
passes = pbp[(pbp["play_type"] == "pass") & (pbp["epa"].notna())].copy()
# Normalize to a true boolean column: if play_action arrives as a 0/1 float
# with NaN, fillna(False) would leave a mixed-type column and the boolean
# masking below could misbehave or raise.
passes["play_action"] = passes["play_action"].fillna(0).astype(bool)

# Overall comparison
pa_comparison = (passes.groupby("play_action")
    .agg(
        attempts=("epa", "count"),
        avg_epa=("epa", "mean"),
        success_rate=("success", lambda x: x.mean() * 100)
    )
    .reset_index())

print("Play Action vs Standard Passing:")
print(pa_comparison)

# Team play action effectiveness
team_pa = (passes.groupby("posteam")
    .apply(lambda x: pd.Series({
        "pa_rate": x["play_action"].mean() * 100,
        "pa_epa": x[x["play_action"]]["epa"].mean(),
        "no_pa_epa": x[~x["play_action"]]["epa"].mean()
    }))
    .reset_index())

# Positive advantage = the team gains EPA when using play action.
team_pa["pa_advantage"] = team_pa["pa_epa"] - team_pa["no_pa_epa"]
print("\nTeam Play Action Effectiveness:")
print(team_pa.sort_values("pa_advantage", ascending=False))
Packages: nflfastR tidyverse nfl_data_py pandas
QB Scramble Analysis
Analyze quarterback scramble efficiency and tendencies.
Intermediate
# QB scramble frequency/efficiency, plus scramble outcomes by situation.
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# QB scramble analysis
# Minimums (200 dropbacks, 10 scrambles) keep the rates meaningful.
qb_scrambles <- pbp %>%
  filter(!is.na(passer_player_id)) %>%
  group_by(passer_player_id, passer_player_name) %>%
  summarize(
    dropbacks = sum(qb_dropback, na.rm = TRUE),
    scrambles = sum(qb_scramble, na.rm = TRUE),
    scramble_rate = mean(qb_scramble, na.rm = TRUE) * 100,
    scramble_yards = sum(yards_gained[qb_scramble == 1], na.rm = TRUE),
    scramble_epa = mean(epa[qb_scramble == 1], na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(dropbacks >= 200, scrambles >= 10) %>%
  arrange(desc(scramble_rate))

print(qb_scrambles)

# Scramble success by situation
# case_when() is ordered: short yardage is checked before third/fourth down.
pbp %>%
  filter(qb_scramble == 1) %>%
  mutate(
    situation = case_when(
      down <= 2 & ydstogo <= 5 ~ "Short yardage",
      down == 3 ~ "Third down",
      down == 4 ~ "Fourth down",
      TRUE ~ "Normal"
    )
  ) %>%
  group_by(situation) %>%
  summarize(
    scrambles = n(),
    avg_yards = mean(yards_gained),
    success_rate = mean(success)
  )
# QB scramble frequency for qualifying quarterbacks.
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# QB scramble analysis
qb_plays = pbp[pbp["passer_player_id"].notna()]

qb_scrambles = (qb_plays.groupby(["passer_player_id", "passer_player_name"])
    .agg(
        dropbacks=("qb_dropback", "sum"),
        scrambles=("qb_scramble", "sum"),
        scramble_rate=("qb_scramble", lambda x: x.mean() * 100)
    )
    .reset_index())

# Minimums (200 dropbacks, 10 scrambles) keep the rates meaningful.
qb_scrambles = qb_scrambles[
    (qb_scrambles["dropbacks"] >= 200) &
    (qb_scrambles["scrambles"] >= 10)
].sort_values("scramble_rate", ascending=False)

print("QB Scramble Rates:")
print(qb_scrambles)
Packages: nflfastR tidyverse nfl_data_py pandas
Receiver Target Analysis
Analyze how teams distribute targets among receivers.
Intermediate
# Receiver target volume, production, and share of team targets.
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Top receivers by targets
# Grouping includes posteam so the target-share join below is per team.
receiver_targets <- pbp %>%
  filter(!is.na(receiver_player_id), play_type == "pass") %>%
  group_by(receiver_player_id, receiver_player_name, posteam) %>%
  summarize(
    targets = n(),
    receptions = sum(complete_pass),
    yards = sum(yards_gained, na.rm = TRUE),
    tds = sum(pass_touchdown),
    epa = sum(epa, na.rm = TRUE),
    avg_depth = mean(air_yards, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(targets))

# Add target share
# Each receiver's targets divided by his team's total charted targets.
receiver_targets <- receiver_targets %>%
  group_by(posteam) %>%
  mutate(
    team_targets = sum(targets),
    target_share = targets / team_targets * 100
  ) %>%
  ungroup()

print(receiver_targets %>% head(30))
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Receiver target analysis
passes = pbp[(pbp["receiver_player_id"].notna()) & (pbp["play_type"] == "pass")]

receiver_targets = (passes.groupby(["receiver_player_id", "receiver_player_name", "posteam"])
    .agg(
        targets=("epa", "count"),
        receptions=("complete_pass", "sum"),
        yards=("yards_gained", "sum"),
        tds=("pass_touchdown", "sum"),
        epa=("epa", "sum"),
        avg_depth=("air_yards", "mean")
    )
    .reset_index()
    .sort_values("targets", ascending=False))

# Add target share
team_targets = receiver_targets.groupby("posteam")["targets"].transform("sum")
receiver_targets["target_share"] = receiver_targets["targets"] / team_targets * 100

print("Top Receivers by Targets:")
print(receiver_targets.head(30))
Packages: nflfastR tidyverse nfl_data_py pandas
Yards After Catch Analysis
Analyze receiver YAC ability and team YAC tendencies.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Receiver YAC analysis (completions only — YAC exists only on catches).
# FIX: the original also reported yac_per_target = total_yac / n(), but
# after filtering to completions n() counts receptions, not targets, so
# that column was mislabeled and numerically identical to avg_yac; it has
# been removed rather than presented as a distinct metric.
receiver_yac <- pbp %>%
  filter(complete_pass == 1, !is.na(yards_after_catch)) %>%
  group_by(receiver_player_id, receiver_player_name) %>%
  summarize(
    receptions = n(),
    total_yac = sum(yards_after_catch),
    avg_yac = mean(yards_after_catch),
    .groups = "drop"
  ) %>%
  filter(receptions >= 40) %>%  # minimum volume to stabilize the average
  arrange(desc(avg_yac))

print(receiver_yac)

# Team YAC vs Air Yards balance: what share of completion yardage comes
# after the catch vs through the air.
team_yac <- pbp %>%
  filter(complete_pass == 1, !is.na(yards_after_catch)) %>%
  group_by(posteam) %>%
  summarize(
    completions = n(),
    avg_air_yards = mean(air_yards, na.rm = TRUE),
    avg_yac = mean(yards_after_catch),
    yac_pct = avg_yac / (avg_air_yards + avg_yac) * 100
  ) %>%
  arrange(desc(avg_yac))

print(team_yac)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Receiver YAC analysis
completions = pbp[(pbp["complete_pass"] == 1) & (pbp["yards_after_catch"].notna())]

receiver_yac = (completions.groupby(["receiver_player_id", "receiver_player_name"])
    .agg(
        receptions=("yards_after_catch", "count"),
        total_yac=("yards_after_catch", "sum"),
        avg_yac=("yards_after_catch", "mean")
    )
    .reset_index())

receiver_yac = receiver_yac[receiver_yac["receptions"] >= 40].sort_values(
    "avg_yac", ascending=False)

print("Top Receivers by YAC:")
print(receiver_yac.head(20))

# Team YAC analysis
team_yac = (completions.groupby("posteam")
    .agg(
        avg_air_yards=("air_yards", "mean"),
        avg_yac=("yards_after_catch", "mean")
    )
    .reset_index()
    .sort_values("avg_yac", ascending=False))

print("\nTeam YAC Rankings:")
print(team_yac)
Packages: nflfastR tidyverse nfl_data_py pandas

Rushing Analysis

Running back and rushing game analytics

Yards Before Contact
Approximate offensive line run blocking with team and rusher efficiency metrics. (nflfastR play-by-play does not include a yards-before-contact field, so rushing EPA and success rate serve as proxies.)
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Team rushing analysis
team_rushing <- pbp %>%
  filter(play_type == "run", !is.na(epa)) %>%
  group_by(posteam) %>%
  summarize(
    rush_attempts = n(),
    avg_yards = mean(yards_gained),
    rush_epa = mean(epa),
    success_rate = mean(success) * 100,
    .groups = "drop"
  ) %>%
  arrange(desc(rush_epa))

print(team_rushing)

# RB efficiency
rb_efficiency <- pbp %>%
  filter(!is.na(rusher_player_id), play_type == "run") %>%
  group_by(rusher_player_id, rusher_player_name) %>%
  summarize(
    attempts = n(),
    yards = sum(yards_gained),
    ypc = mean(yards_gained),
    epa = mean(epa, na.rm = TRUE),
    success_rate = mean(success) * 100,
    .groups = "drop"
  ) %>%
  filter(attempts >= 100) %>%
  arrange(desc(epa))

print(rb_efficiency)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Team rushing analysis
rushes = pbp[(pbp["play_type"] == "run") & (pbp["epa"].notna())]

team_rushing = (rushes.groupby("posteam")
    .agg(
        rush_attempts=("epa", "count"),
        avg_yards=("yards_gained", "mean"),
        rush_epa=("epa", "mean"),
        success_rate=("success", lambda x: x.mean() * 100)
    )
    .reset_index()
    .sort_values("rush_epa", ascending=False))

print("Team Rushing Rankings:")
print(team_rushing)
Packages: nflfastR tidyverse nfl_data_py pandas
Run Direction Efficiency
Compare rushing efficiency by run gap and direction.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Run direction analysis
run_direction <- pbp %>%
  filter(play_type == "run", !is.na(run_location), !is.na(run_gap)) %>%
  group_by(run_location, run_gap) %>%
  summarize(
    attempts = n(),
    avg_yards = mean(yards_gained),
    epa = mean(epa, na.rm = TRUE),
    success_rate = mean(success) * 100,
    .groups = "drop"
  ) %>%
  arrange(desc(epa))

print(run_direction)

# Team run direction tendencies
team_direction <- pbp %>%
  filter(play_type == "run", !is.na(run_location)) %>%
  group_by(posteam, run_location) %>%
  summarize(attempts = n(), .groups = "drop") %>%
  group_by(posteam) %>%
  mutate(pct = attempts / sum(attempts) * 100) %>%
  pivot_wider(names_from = run_location, values_from = c(attempts, pct))

print(team_direction)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Run direction analysis
rushes = pbp[(pbp["play_type"] == "run") &
             (pbp["run_location"].notna()) &
             (pbp["run_gap"].notna())]

run_direction = (rushes.groupby(["run_location", "run_gap"])
    .agg(
        attempts=("epa", "count"),
        avg_yards=("yards_gained", "mean"),
        epa=("epa", "mean"),
        success_rate=("success", lambda x: x.mean() * 100)
    )
    .reset_index()
    .sort_values("epa", ascending=False))

print("Run Efficiency by Direction:")
print(run_direction)
Packages: nflfastR tidyverse nfl_data_py pandas
Box Count Impact
Analyze how defenders in the box affect rushing success.
Advanced
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Box count analysis
box_analysis <- pbp %>%
  filter(play_type == "run", !is.na(defenders_in_box), !is.na(epa)) %>%
  group_by(defenders_in_box) %>%
  summarize(
    attempts = n(),
    avg_yards = mean(yards_gained),
    epa = mean(epa),
    success_rate = mean(success) * 100,
    .groups = "drop"
  ) %>%
  filter(attempts >= 100)

print(box_analysis)

# Team success against stacked boxes (8+)
stacked_box <- pbp %>%
  filter(play_type == "run", defenders_in_box >= 8, !is.na(epa)) %>%
  group_by(posteam) %>%
  summarize(
    stacked_attempts = n(),
    stacked_epa = mean(epa),
    stacked_success = mean(success) * 100
  ) %>%
  arrange(desc(stacked_epa))

print(stacked_box)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Box count analysis
rushes = pbp[(pbp["play_type"] == "run") &
             (pbp["defenders_in_box"].notna()) &
             (pbp["epa"].notna())]

box_analysis = (rushes.groupby("defenders_in_box")
    .agg(
        attempts=("epa", "count"),
        avg_yards=("yards_gained", "mean"),
        epa=("epa", "mean"),
        success_rate=("success", lambda x: x.mean() * 100)
    )
    .reset_index())

box_analysis = box_analysis[box_analysis["attempts"] >= 100]
print("Rushing by Defenders in Box:")
print(box_analysis)
Packages: nflfastR tidyverse nfl_data_py pandas
Goal Line Rushing
Analyze rushing efficiency in goal-to-go situations.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Goal line rushing (inside 5 yard line)
goal_line <- pbp %>%
  filter(play_type == "run", yardline_100 <= 5, !is.na(epa))

# Overall goal line rushing
goal_line %>%
  summarize(
    attempts = n(),
    td_rate = mean(rush_touchdown) * 100,
    success_rate = mean(success) * 100,
    avg_yards = mean(yards_gained)
  )

# Team goal line rushing
team_goal_line <- goal_line %>%
  group_by(posteam) %>%
  summarize(
    attempts = n(),
    touchdowns = sum(rush_touchdown),
    td_rate = mean(rush_touchdown) * 100,
    success_rate = mean(success) * 100
  ) %>%
  filter(attempts >= 10) %>%
  arrange(desc(td_rate))

print(team_goal_line)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Goal line rushing
goal_line = pbp[(pbp["play_type"] == "run") &
                (pbp["yardline_100"] <= 5) &
                (pbp["epa"].notna())]

# Team goal line rushing
team_goal_line = (goal_line.groupby("posteam")
    .agg(
        attempts=("rush_touchdown", "count"),
        touchdowns=("rush_touchdown", "sum"),
        td_rate=("rush_touchdown", lambda x: x.mean() * 100),
        success_rate=("success", lambda x: x.mean() * 100)
    )
    .reset_index())

team_goal_line = team_goal_line[team_goal_line["attempts"] >= 10].sort_values(
    "td_rate", ascending=False)

print("Goal Line Rushing Efficiency:")
print(team_goal_line)
Packages: nflfastR tidyverse nfl_data_py pandas
RB Receiving vs Rushing Split
Compare running back value as rushers vs receivers.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# RB rushing stats
rb_rush <- pbp %>%
  filter(!is.na(rusher_player_id), play_type == "run") %>%
  group_by(rusher_player_id) %>%
  summarize(
    rush_att = n(),
    rush_yards = sum(yards_gained),
    rush_epa = sum(epa, na.rm = TRUE),
    .groups = "drop"
  )

# RB receiving stats
rb_rec <- pbp %>%
  filter(!is.na(receiver_player_id), play_type == "pass") %>%
  group_by(receiver_player_id) %>%
  summarize(
    targets = n(),
    receptions = sum(complete_pass),
    rec_yards = sum(yards_gained[complete_pass == 1], na.rm = TRUE),
    rec_epa = sum(epa, na.rm = TRUE),
    .groups = "drop"
  )

# Combine (simple join by player ID)
rb_combined <- rb_rush %>%
  inner_join(rb_rec, by = c("rusher_player_id" = "receiver_player_id")) %>%
  filter(rush_att >= 50) %>%
  mutate(
    total_epa = rush_epa + rec_epa,
    rec_epa_pct = rec_epa / total_epa * 100
  ) %>%
  arrange(desc(total_epa))

print(rb_combined)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# RB rushing stats
rushes = pbp[(pbp["rusher_player_id"].notna()) & (pbp["play_type"] == "run")]
rb_rush = (rushes.groupby("rusher_player_id")
    .agg(
        rush_att=("epa", "count"),
        rush_yards=("yards_gained", "sum"),
        rush_epa=("epa", "sum")
    )
    .reset_index())

# RB receiving stats
receptions = pbp[(pbp["receiver_player_id"].notna()) & (pbp["play_type"] == "pass")]
rb_rec = (receptions.groupby("receiver_player_id")
    .agg(
        targets=("epa", "count"),
        receptions=("complete_pass", "sum"),
        rec_epa=("epa", "sum")
    )
    .reset_index())

# Combine
rb_combined = rb_rush.merge(rb_rec, left_on="rusher_player_id",
                             right_on="receiver_player_id", how="inner")
rb_combined = rb_combined[rb_combined["rush_att"] >= 50]
rb_combined["total_epa"] = rb_combined["rush_epa"] + rb_combined["rec_epa"]

print("RB Total EPA (Rush + Receiving):")
print(rb_combined.sort_values("total_epa", ascending=False).head(20))
Packages: nflfastR tidyverse nfl_data_py pandas
Rush Success by Formation
Analyze rushing efficiency from different offensive formations.
Advanced
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Rush success by formation
formation_rush <- pbp %>%
  filter(play_type == "run", !is.na(epa), !is.na(offense_formation)) %>%
  group_by(offense_formation) %>%
  summarize(
    attempts = n(),
    avg_yards = mean(yards_gained),
    epa = mean(epa),
    success_rate = mean(success) * 100,
    .groups = "drop"
  ) %>%
  filter(attempts >= 100) %>%
  arrange(desc(epa))

print(formation_rush)

# Shotgun vs Under Center rushing
pbp %>%
  filter(play_type == "run", !is.na(epa)) %>%
  mutate(
    shotgun = if_else(offense_formation == "SHOTGUN", "Shotgun", "Under Center")
  ) %>%
  group_by(shotgun) %>%
  summarize(
    attempts = n(),
    avg_yards = mean(yards_gained),
    epa = mean(epa),
    success_rate = mean(success) * 100
  )
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Rush success by formation
rushes = pbp[(pbp["play_type"] == "run") &
             (pbp["epa"].notna()) &
             (pbp["offense_formation"].notna())]

formation_rush = (rushes.groupby("offense_formation")
    .agg(
        attempts=("epa", "count"),
        avg_yards=("yards_gained", "mean"),
        epa=("epa", "mean"),
        success_rate=("success", lambda x: x.mean() * 100)
    )
    .reset_index())

formation_rush = formation_rush[formation_rush["attempts"] >= 100].sort_values(
    "epa", ascending=False)

print("Rushing Efficiency by Formation:")
print(formation_rush)
Packages: nflfastR tidyverse nfl_data_py pandas

Defensive Analytics

Analyze defensive performance, coverage, and pressure metrics

Defensive EPA by Unit
Rank defenses by EPA allowed per play.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Rank defenses by average EPA allowed per play (lower = better defense).
# Restrict to pass/run plays that carry a valid EPA value.
def_plays <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run"))

def_epa <- def_plays %>%
  group_by(defteam) %>%
  summarize(
    plays = n(),
    epa_allowed = mean(epa),
    # Split EPA allowed by play type via logical subsetting within groups.
    pass_epa_allowed = mean(epa[play_type == "pass"]),
    rush_epa_allowed = mean(epa[play_type == "run"]),
    success_allowed = mean(success) * 100,
    .groups = "drop"
  ) %>%
  arrange(epa_allowed)  # Lower is better

print(def_epa)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Defensive EPA rankings
plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"].isin(["pass", "run"]))]

def_epa = (plays.groupby("defteam")
    .agg(
        plays=("epa", "count"),
        epa_allowed=("epa", "mean"),
        success_allowed=("success", lambda x: x.mean() * 100)
    )
    .reset_index()
    .sort_values("epa_allowed"))  # Lower is better

print("Defensive EPA Rankings (lower is better):")
print(def_epa)
Packages: nflfastR tidyverse nfl_data_py pandas
Pass Rush Metrics
Analyze pass rush effectiveness and sack rates.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Team pass rush metrics
pass_rush <- pbp %>%
  filter(play_type == "pass", !is.na(epa)) %>%
  group_by(defteam) %>%
  summarize(
    pass_plays_faced = n(),
    sacks = sum(sack),
    sack_rate = mean(sack) * 100,
    qb_hits = sum(qb_hit, na.rm = TRUE),
    epa_allowed = mean(epa),
    .groups = "drop"
  ) %>%
  arrange(desc(sack_rate))

print(pass_rush)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Pass rush metrics
pass_plays = pbp[(pbp["play_type"] == "pass") & (pbp["epa"].notna())]

pass_rush = (pass_plays.groupby("defteam")
    .agg(
        pass_plays=("epa", "count"),
        sacks=("sack", "sum"),
        sack_rate=("sack", lambda x: x.mean() * 100),
        epa_allowed=("epa", "mean")
    )
    .reset_index()
    .sort_values("sack_rate", ascending=False))

print("Team Pass Rush Effectiveness:")
print(pass_rush)
Packages: nflfastR tidyverse nfl_data_py pandas
Red Zone Defense
Analyze defensive efficiency in the red zone.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Red zone defense
rz_defense <- pbp %>%
  filter(yardline_100 <= 20, !is.na(epa)) %>%
  group_by(defteam) %>%
  summarize(
    rz_plays = n(),
    rz_epa_allowed = mean(epa),
    td_allowed_rate = mean(touchdown, na.rm = TRUE) * 100,
    success_allowed = mean(success) * 100,
    .groups = "drop"
  ) %>%
  arrange(rz_epa_allowed)

print(rz_defense)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Red zone defense: plays snapped at or inside the opponent 20 with valid EPA.
in_red_zone = pbp["yardline_100"] <= 20
has_epa = pbp["epa"].notna()
rz_plays = pbp[in_red_zone & has_epa]

# Per-defense volume, EPA allowed, and TD rate allowed (percent).
grouped = rz_plays.groupby("defteam")
rz_defense = grouped.agg(
    rz_plays=("epa", "count"),
    rz_epa_allowed=("epa", "mean"),
    td_allowed_rate=("touchdown", lambda s: s.mean() * 100),
)
rz_defense = rz_defense.reset_index().sort_values("rz_epa_allowed")

print("Red Zone Defense Rankings:")
print(rz_defense)
Packages: nflfastR tidyverse nfl_data_py pandas
Third Down Defense
Analyze defensive third down conversion rate allowed.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Third down defense
third_down_def <- pbp %>%
  filter(down == 3, !is.na(epa)) %>%
  group_by(defteam) %>%
  summarize(
    third_downs = n(),
    conversions_allowed = sum(first_down, na.rm = TRUE),
    conv_rate_allowed = mean(first_down, na.rm = TRUE) * 100,
    epa_allowed = mean(epa),
    .groups = "drop"
  ) %>%
  arrange(conv_rate_allowed)

print(third_down_def)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Third down defense
third_down = pbp[(pbp["down"] == 3) & (pbp["epa"].notna())]

third_down_def = (third_down.groupby("defteam")
    .agg(
        third_downs=("epa", "count"),
        conv_rate_allowed=("first_down", lambda x: x.mean() * 100),
        epa_allowed=("epa", "mean")
    )
    .reset_index()
    .sort_values("conv_rate_allowed"))

print("Third Down Defense (lower is better):")
print(third_down_def)
Packages: nflfastR tidyverse nfl_data_py pandas
Turnover Creation Rate
Analyze which defenses create the most turnovers.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Turnover analysis by defense
turnovers <- pbp %>%
  filter(!is.na(defteam)) %>%
  group_by(defteam) %>%
  summarize(
    plays = n(),
    interceptions = sum(interception, na.rm = TRUE),
    fumbles_forced = sum(fumble_lost, na.rm = TRUE),
    total_turnovers = interceptions + fumbles_forced,
    turnover_rate = total_turnovers / plays * 100,
    .groups = "drop"
  ) %>%
  arrange(desc(total_turnovers))

print(turnovers)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Turnover analysis: every play with a defense on the field.
plays = pbp[pbp["defteam"].notna()]

turnovers = (plays.groupby("defteam")
    .agg(
        # FIX: the original counted the "epa" column, which silently drops
        # plays with missing EPA and understates the denominator (the R
        # version uses n()). "defteam" is non-null for every row after the
        # filter above, so counting it counts all plays.
        plays=("defteam", "count"),
        interceptions=("interception", "sum"),
        fumbles=("fumble_lost", "sum")
    )
    .reset_index())

# NOTE: fumble_lost marks fumbles the offense lost (recovered by the
# defense), not merely fumbles forced.
turnovers["total_turnovers"] = turnovers["interceptions"] + turnovers["fumbles"]
turnovers["turnover_rate"] = turnovers["total_turnovers"] / turnovers["plays"] * 100
turnovers = turnovers.sort_values("total_turnovers", ascending=False)

print("Defensive Turnovers Created:")
print(turnovers)
Packages: nflfastR tidyverse nfl_data_py pandas
Yards After Catch Allowed
Analyze defensive ability to limit yards after catch.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# YAC allowed analysis
yac_allowed <- pbp %>%
  filter(complete_pass == 1, !is.na(yards_after_catch)) %>%
  group_by(defteam) %>%
  summarize(
    completions_allowed = n(),
    total_yac_allowed = sum(yards_after_catch),
    avg_yac_allowed = mean(yards_after_catch),
    .groups = "drop"
  ) %>%
  arrange(avg_yac_allowed)

print(yac_allowed)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# YAC allowed
completions = pbp[(pbp["complete_pass"] == 1) & (pbp["yards_after_catch"].notna())]

yac_allowed = (completions.groupby("defteam")
    .agg(
        completions=("yards_after_catch", "count"),
        total_yac=("yards_after_catch", "sum"),
        avg_yac=("yards_after_catch", "mean")
    )
    .reset_index()
    .sort_values("avg_yac"))

print("YAC Allowed by Defense (lower is better):")
print(yac_allowed)
Packages: nflfastR tidyverse nfl_data_py pandas

Special Teams

Analyze kicking, punting, and return game performance

Field Goal Analysis
Analyze kicker accuracy by distance and conditions.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Field goal analysis
fg_analysis <- pbp %>%
  filter(play_type == "field_goal", !is.na(kick_distance)) %>%
  mutate(
    distance_bucket = case_when(
      kick_distance < 30 ~ "Under 30",
      kick_distance < 40 ~ "30-39",
      kick_distance < 50 ~ "40-49",
      TRUE ~ "50+"
    )
  ) %>%
  group_by(distance_bucket) %>%
  summarize(
    attempts = n(),
    makes = sum(field_goal_result == "made"),
    pct = mean(field_goal_result == "made") * 100,
    .groups = "drop"
  )

print(fg_analysis)

# Team kicking rankings
team_fg <- pbp %>%
  filter(play_type == "field_goal") %>%
  group_by(posteam) %>%
  summarize(
    attempts = n(),
    makes = sum(field_goal_result == "made"),
    pct = mean(field_goal_result == "made") * 100,
    avg_distance = mean(kick_distance, na.rm = TRUE)
  ) %>%
  arrange(desc(pct))

print(team_fg)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Field goal analysis.
# FIX: .copy() — assigning a new column to a boolean-mask slice of pbp
# raises pandas' SettingWithCopyWarning and may not modify the frame.
fgs = pbp[(pbp["play_type"] == "field_goal") & (pbp["kick_distance"].notna())].copy()

def distance_bucket(d):
    """Map a kick distance in yards to a reporting bucket."""
    if d < 30: return "Under 30"
    elif d < 40: return "30-39"
    elif d < 50: return "40-49"
    else: return "50+"

fgs["distance_bucket"] = fgs["kick_distance"].apply(distance_bucket)

# Attempts, makes, and make percentage per distance bucket.
fg_analysis = (fgs.groupby("distance_bucket")
    .agg(
        attempts=("field_goal_result", "count"),
        makes=("field_goal_result", lambda x: (x == "made").sum()),
        pct=("field_goal_result", lambda x: (x == "made").mean() * 100)
    )
    .reset_index())

print("Field Goal Accuracy by Distance:")
print(fg_analysis)
Packages: nflfastR tidyverse nfl_data_py pandas
Punt Analysis
Analyze punter performance and net yardage.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Punter analysis: volume, gross distance, inside-20 punts, touchbacks.
punt_analysis <- pbp %>%
  filter(play_type == "punt", !is.na(kick_distance)) %>%
  group_by(punter_player_name) %>%
  summarize(
    punts = n(),
    avg_gross = mean(kick_distance),
    # FIX: the original summed yardline_100 <= 20, i.e. punts SNAPPED
    # inside the opponent 20 (nearly always zero — that's FG range), not
    # punts downed inside the 20. nflfastR provides the
    # punt_inside_twenty indicator for exactly this.
    inside_20 = sum(punt_inside_twenty, na.rm = TRUE),
    touchbacks = sum(touchback, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(punts >= 20) %>%  # minimum volume for stable averages
  arrange(desc(avg_gross))

print(punt_analysis)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Punt analysis
punts = pbp[(pbp["play_type"] == "punt") & (pbp["kick_distance"].notna())]

punt_analysis = (punts.groupby("punter_player_name")
    .agg(
        punts=("kick_distance", "count"),
        avg_gross=("kick_distance", "mean"),
        touchbacks=("touchback", "sum")
    )
    .reset_index())

punt_analysis = punt_analysis[punt_analysis["punts"] >= 20].sort_values(
    "avg_gross", ascending=False)

print("Punter Rankings:")
print(punt_analysis)
Packages: nflfastR tidyverse nfl_data_py pandas
Kickoff Return Analysis
Analyze kickoff return performance by team.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Kickoff return analysis
ko_returns <- pbp %>%
  filter(play_type == "kickoff", !is.na(return_yards)) %>%
  group_by(posteam) %>%
  summarize(
    returns = n(),
    total_yards = sum(return_yards),
    avg_return = mean(return_yards),
    touchbacks = sum(touchback, na.rm = TRUE),
    return_tds = sum(return_touchdown, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_return))

print(ko_returns)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Kickoff return analysis
kickoffs = pbp[(pbp["play_type"] == "kickoff") & (pbp["return_yards"].notna())]

ko_returns = (kickoffs.groupby("posteam")
    .agg(
        returns=("return_yards", "count"),
        total_yards=("return_yards", "sum"),
        avg_return=("return_yards", "mean"),
        tds=("return_touchdown", "sum")
    )
    .reset_index()
    .sort_values("avg_return", ascending=False))

print("Kickoff Return Rankings:")
print(ko_returns)
Packages: nflfastR tidyverse nfl_data_py pandas
Punt Return Analysis
Analyze punt return efficiency and big plays.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Punt return analysis
punt_returns <- pbp %>%
  filter(play_type == "punt", !is.na(return_yards), return_yards > 0) %>%
  group_by(posteam) %>%
  summarize(
    returns = n(),
    total_yards = sum(return_yards),
    avg_return = mean(return_yards),
    return_tds = sum(return_touchdown, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_return))

print(punt_returns)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Punt return analysis
punt_returns = pbp[(pbp["play_type"] == "punt") &
                   (pbp["return_yards"].notna()) &
                   (pbp["return_yards"] > 0)]

pr_analysis = (punt_returns.groupby("posteam")
    .agg(
        returns=("return_yards", "count"),
        total_yards=("return_yards", "sum"),
        avg_return=("return_yards", "mean")
    )
    .reset_index()
    .sort_values("avg_return", ascending=False))

print("Punt Return Rankings:")
print(pr_analysis)
Packages: nflfastR tidyverse nfl_data_py pandas
Special Teams EPA
Calculate EPA impact of special teams plays by type.
Advanced
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Special teams EPA by play type
st_epa <- pbp %>%
  filter(special_teams_play == 1, !is.na(epa)) %>%
  mutate(
    st_play_type = case_when(
      play_type == "kickoff" ~ "Kickoff",
      play_type == "punt" ~ "Punt",
      play_type == "field_goal" ~ "Field Goal",
      play_type == "extra_point" ~ "Extra Point",
      TRUE ~ "Other"
    )
  ) %>%
  group_by(posteam, st_play_type) %>%
  summarize(
    plays = n(),
    total_epa = sum(epa),
    avg_epa = mean(epa),
    .groups = "drop"
  )

# Team special teams rankings
team_st_epa <- pbp %>%
  filter(special_teams_play == 1, !is.na(epa)) %>%
  group_by(posteam) %>%
  summarize(
    total_st_epa = sum(epa),
    avg_st_epa = mean(epa),
    st_plays = n()
  ) %>%
  arrange(desc(total_st_epa))

print(team_st_epa)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Filter special teams plays with a valid EPA.
# FIX: .copy() — assigning a new column to a boolean-mask slice of pbp
# raises pandas' SettingWithCopyWarning and may not modify the frame.
st_plays = pbp[(pbp["special_teams_play"] == 1) & (pbp["epa"].notna())].copy()

# Map raw play_type values to display labels for per-type breakdowns.
def st_type(play):
    if play == "kickoff": return "Kickoff"
    elif play == "punt": return "Punt"
    elif play == "field_goal": return "Field Goal"
    elif play == "extra_point": return "Extra Point"
    else: return "Other"

st_plays["st_play_type"] = st_plays["play_type"].apply(st_type)

# Team special teams EPA: total and per-play, ranked by total impact.
team_st_epa = (st_plays.groupby("posteam")
    .agg(
        total_st_epa=("epa", "sum"),
        avg_st_epa=("epa", "mean"),
        st_plays=("epa", "count")
    )
    .reset_index()
    .sort_values("total_st_epa", ascending=False))

print("Special Teams EPA Rankings:")
print(team_st_epa)
Packages: nflfastR tidyverse nfl_data_py pandas
Blocked Kicks Analysis
Analyze blocked field goals, punts, and extra points.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2020:2023)

# Find blocked kicks
blocked_kicks <- pbp %>%
  filter(
    play_type %in% c("field_goal", "punt", "extra_point"),
    str_detect(tolower(desc), "block")
  ) %>%
  mutate(
    blocked_type = play_type
  )

# Blocked kicks by team (blocking team)
team_blocks <- blocked_kicks %>%
  group_by(defteam, blocked_type) %>%
  summarize(blocks = n(), .groups = "drop") %>%
  pivot_wider(names_from = blocked_type, values_from = blocks, values_fill = 0) %>%
  mutate(total_blocks = rowSums(across(where(is.numeric)))) %>%
  arrange(desc(total_blocks))

print("Teams with Most Blocked Kicks (2020-2023):")
print(team_blocks)

# Blocked kicks allowed
blocks_allowed <- blocked_kicks %>%
  group_by(posteam, blocked_type) %>%
  summarize(blocked = n(), .groups = "drop") %>%
  pivot_wider(names_from = blocked_type, values_from = blocked, values_fill = 0) %>%
  mutate(total_blocked = rowSums(across(where(is.numeric)))) %>%
  arrange(desc(total_blocked))

print("\nTeams with Most Kicks Blocked Against:")
print(blocks_allowed)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2020, 2021, 2022, 2023])

# Find blocked kicks
st_plays = pbp[pbp["play_type"].isin(["field_goal", "punt", "extra_point"])]
blocked = st_plays[st_plays["desc"].str.lower().str.contains("block", na=False)]

# Blocked kicks by blocking team
team_blocks = (blocked.groupby(["defteam", "play_type"])
    .size()
    .reset_index(name="blocks"))

# Pivot to get blocks by type
blocks_pivot = team_blocks.pivot_table(
    index="defteam",
    columns="play_type",
    values="blocks",
    fill_value=0
).reset_index()

blocks_pivot["total_blocks"] = blocks_pivot.select_dtypes(include="number").sum(axis=1)
blocks_pivot = blocks_pivot.sort_values("total_blocks", ascending=False)

print("Teams with Most Blocked Kicks (2020-2023):")
print(blocks_pivot.head(10))
Packages: nflfastR tidyverse nfl_data_py pandas

Betting Models

Build predictive models for spreads, totals, and player props

Simple Spread Prediction Model
Build a basic spread prediction model using team EPA ratings.
Advanced
library(nflfastR)
library(tidyverse)

# Load data
pbp <- load_pbp(2023)
schedules <- load_schedules(2023)

# Calculate team ratings: offensive EPA/play joined with defensive EPA/play
# allowed; net EPA is scaled to express the rating in points.
team_ratings <- pbp %>%
  filter(!is.na(epa)) %>%
  group_by(posteam) %>%
  summarize(off_epa = mean(epa)) %>%
  left_join(
    pbp %>%
      filter(!is.na(epa)) %>%
      group_by(defteam) %>%
      summarize(def_epa = mean(epa)),
    by = c("posteam" = "defteam")
  ) %>%
  mutate(
    net_epa = off_epa - def_epa,
    power_rating = net_epa * 3.5  # Convert to points (heuristic scale factor)
  )

# Create matchup predictions for completed games.
# NOTE(review): assumes schedules$result is the home margin
# (home_score - away_score) — confirm against the nflfastR schedules docs;
# the sign conventions below depend on it.
games <- schedules %>%
  filter(!is.na(result)) %>%
  left_join(team_ratings, by = c("home_team" = "posteam")) %>%
  rename(home_power = power_rating) %>%
  left_join(team_ratings %>% select(posteam, power_rating),
            by = c("away_team" = "posteam")) %>%
  rename(away_power = power_rating) %>%
  mutate(
    # Spread from the home team's perspective: negative = home favored.
    pred_spread = away_power - home_power - 2.5,  # HFA of 2.5 points
    actual_spread = -result,
    error = pred_spread - actual_spread,
    # Correct when the predicted favorite actually won the game.
    correct_side = (pred_spread > 0 & result < 0) |
                   (pred_spread < 0 & result > 0)
  )

# Model performance. Caveat: ratings are built from the same season being
# predicted (in-sample), so these numbers overstate real-world accuracy.
cat("Mean Absolute Error:", mean(abs(games$error)), "\n")
cat("Correct Side %:", mean(games$correct_side) * 100, "%\n")
import nfl_data_py as nfl
import pandas as pd
import numpy as np

# Load data
pbp = nfl.import_pbp_data([2023])
schedules = nfl.import_schedules([2023])

# Calculate team ratings
plays = pbp[pbp["epa"].notna()]

off_epa = plays.groupby("posteam")["epa"].mean().reset_index()
off_epa.columns = ["team", "off_epa"]

def_epa = plays.groupby("defteam")["epa"].mean().reset_index()
def_epa.columns = ["team", "def_epa"]

team_ratings = off_epa.merge(def_epa, on="team")
team_ratings["net_epa"] = team_ratings["off_epa"] - team_ratings["def_epa"]
team_ratings["power_rating"] = team_ratings["net_epa"] * 3.5

# Create matchup predictions
games = schedules[schedules["result"].notna()].copy()
games = games.merge(
    team_ratings[["team", "power_rating"]],
    left_on="home_team", right_on="team"
).rename(columns={"power_rating": "home_power"})
games = games.merge(
    team_ratings[["team", "power_rating"]],
    left_on="away_team", right_on="team"
).rename(columns={"power_rating": "away_power"})

games["pred_spread"] = games["away_power"] - games["home_power"] - 2.5
games["actual_spread"] = -games["result"]
games["error"] = games["pred_spread"] - games["actual_spread"]
games["correct_side"] = ((games["pred_spread"] > 0) & (games["result"] < 0)) | \
                        ((games["pred_spread"] < 0) & (games["result"] > 0))

print(f"Mean Absolute Error: {games['error'].abs().mean():.2f}")
print(f"Correct Side %: {games['correct_side'].mean() * 100:.1f}%")
Packages: nflfastR tidyverse nfl_data_py pandas numpy
Over/Under Totals Model
Predict game totals using pace and efficiency metrics.
Advanced
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)
schedules <- load_schedules(2023)

# Calculate per-team scoring and pace metrics from play-by-play.
# Points are approximated as 7 per TD (TD + XP) and 3 per made FG;
# safeties and 2-point conversions are ignored.
team_scoring <- pbp %>%
  filter(!is.na(epa)) %>%
  group_by(game_id, posteam) %>%
  summarize(
    plays = n(),
    # BUG FIX: the original wrote `field_goal_result == "made" * 3`, which
    # parses as `field_goal_result == ("made" * 3)` because `*` binds
    # tighter than `==` — and `"made" * 3` is an error in R ("non-numeric
    # argument to binary operator"). Count made FGs first, then scale.
    points_scored = sum(touchdown, na.rm = TRUE) * 7 +
                    sum(field_goal_result == "made", na.rm = TRUE) * 3,
    .groups = "drop"
  ) %>%
  group_by(posteam) %>%
  summarize(
    avg_plays = mean(plays),
    avg_points = mean(points_scored),
    pace = avg_plays / 60  # plays per minute proxy
  )

# Create totals predictions: join each completed game that has a posted
# total to both teams' scoring profiles.
games <- schedules %>%
  filter(!is.na(result), !is.na(total)) %>%
  left_join(team_scoring, by = c("home_team" = "posteam")) %>%
  rename(home_points = avg_points, home_pace = pace) %>%
  left_join(team_scoring %>% select(posteam, avg_points, pace),
            by = c("away_team" = "posteam")) %>%
  rename(away_points = avg_points, away_pace = pace) %>%
  mutate(
    # Predicted total = combined expected points scaled by blended pace.
    pred_total = (home_points + away_points) *
                 ((home_pace + away_pace) / 2),
    actual_total = home_score + away_score,
    over_under = if_else(actual_total > total, "Over", "Under"),
    pred_over = pred_total > total,
    actual_over = actual_total > total,
    correct = pred_over == actual_over
  )

# Model performance
cat("Correct %:", mean(games$correct) * 100, "%\n")
cat("MAE:", mean(abs(games$pred_total - games$actual_total)), "\n")
import nfl_data_py as nfl
import pandas as pd
import numpy as np

pbp = nfl.import_pbp_data([2023])
schedules = nfl.import_schedules([2023])

# Calculate team scoring metrics
# Keep only rows with a valid EPA (drops non-plays like timeouts).
plays = pbp[pbp["epa"].notna()]

# Per-game play volume and touchdown count for each offense.
game_scoring = (plays.groupby(["game_id", "posteam"])
    .agg(
        plays=("epa", "count"),
        touchdowns=("touchdown", "sum")
    )
    .reset_index())
# Rough points proxy: 7 per TD. NOTE(review): field goals are ignored
# here, unlike the R version of this example -- confirm intended.
game_scoring["points_approx"] = game_scoring["touchdowns"] * 7

# Season averages per team.
team_scoring = (game_scoring.groupby("posteam")
    .agg(
        avg_plays=("plays", "mean"),
        avg_points=("points_approx", "mean")
    )
    .reset_index())
# Pace proxy: average plays scaled by a 60-minute game.
team_scoring["pace"] = team_scoring["avg_plays"] / 60

# Create totals predictions
games = schedules[schedules["result"].notna() & schedules["total"].notna()].copy()
games = games.merge(
    team_scoring[["posteam", "avg_points", "pace"]],
    left_on="home_team", right_on="posteam"
).rename(columns={"avg_points": "home_points", "pace": "home_pace"})
games = games.merge(
    team_scoring[["posteam", "avg_points", "pace"]],
    left_on="away_team", right_on="posteam"
).rename(columns={"avg_points": "away_points", "pace": "away_pace"})

# Predicted total = combined scoring scaled by the average pace factor.
games["pred_total"] = (games["home_points"] + games["away_points"]) * \
                      ((games["home_pace"] + games["away_pace"]) / 2)
games["actual_total"] = games["home_score"] + games["away_score"]
# Correct when prediction and result land on the same side of the line.
games["correct"] = (games["pred_total"] > games["total"]) == \
                   (games["actual_total"] > games["total"])

print(f"Correct %: {games['correct'].mean() * 100:.1f}%")
print(f"MAE: {(games['pred_total'] - games['actual_total']).abs().mean():.1f}")
Packages: nflfastR tidyverse nfl_data_py pandas numpy
Player Prop Projections
Project player receiving yards using target share and per-target efficiency.
Advanced
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Calculate receiver baseline stats
# A "target" here is any pass play with a charted receiver.
receiver_stats <- pbp %>%
  filter(!is.na(receiver_player_id), play_type == "pass") %>%
  group_by(receiver_player_id, receiver_player_name, posteam) %>%
  summarize(
    targets = n(),
    receptions = sum(complete_pass),
    yards = sum(yards_gained, na.rm = TRUE),
    tds = sum(touchdown),
    avg_depth = mean(air_yards, na.rm = TRUE),  # average depth of target
    .groups = "drop"
  ) %>%
  filter(targets >= 50) %>%  # minimum volume so the rates are stable
  mutate(
    catch_rate = receptions / targets,
    yards_per_target = yards / targets,
    yards_per_reception = yards / receptions
  )

# Calculate team target rates for each receiver
team_targets <- pbp %>%
  filter(play_type == "pass", !is.na(receiver_player_id)) %>%
  group_by(posteam) %>%
  summarize(team_targets = n())

# Each receiver's slice of his own team's targets.
receiver_share <- receiver_stats %>%
  left_join(team_targets, by = "posteam") %>%
  mutate(target_share = targets / team_targets)

# Project yards for next game
# Assume team throws ~35 passes per game
receiver_share <- receiver_share %>%
  mutate(
    proj_targets = target_share * 35,
    proj_yards = proj_targets * yards_per_target
  ) %>%
  arrange(desc(proj_yards))

print(receiver_share %>%
        select(receiver_player_name, target_share,
               yards_per_target, proj_yards) %>%
        head(20))
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Calculate receiver baseline stats
# A "target" is any pass play with a charted receiver.
pass_plays = pbp[(pbp["receiver_player_id"].notna()) &
                 (pbp["play_type"] == "pass")]

receiver_stats = (pass_plays.groupby(
    ["receiver_player_id", "receiver_player_name", "posteam"])
    .agg(
        targets=("play_id", "count"),
        receptions=("complete_pass", "sum"),
        yards=("yards_gained", "sum"),
        tds=("touchdown", "sum"),
        avg_depth=("air_yards", "mean")
    )
    .reset_index())

# Minimum volume filter so the per-target rates are stable.
receiver_stats = receiver_stats[receiver_stats["targets"] >= 50]
receiver_stats["catch_rate"] = receiver_stats["receptions"] / receiver_stats["targets"]
receiver_stats["yards_per_target"] = receiver_stats["yards"] / receiver_stats["targets"]

# Calculate team target rates
team_targets = (pass_plays.groupby("posteam")
    .size().reset_index(name="team_targets"))

# Each receiver's slice of his own team's targets.
receiver_share = receiver_stats.merge(team_targets, on="posteam")
receiver_share["target_share"] = receiver_share["targets"] / receiver_share["team_targets"]

# Project yards (35 passes per game assumption)
receiver_share["proj_targets"] = receiver_share["target_share"] * 35
receiver_share["proj_yards"] = receiver_share["proj_targets"] * receiver_share["yards_per_target"]

result = receiver_share.nlargest(20, "proj_yards")[
    ["receiver_player_name", "target_share", "yards_per_target", "proj_yards"]
]
print(result)
Packages: nflfastR tidyverse nfl_data_py pandas
Elo Rating System
Build and track Elo ratings for all NFL teams.
Advanced
library(nflfastR)
library(tidyverse)

schedules <- load_schedules(2020:2023)

# Initialize Elo ratings at 1500 for every team that appears in the
# schedule. Don't hard-code 32: relocations/renamed abbreviations can
# change the number of distinct codes, and a mismatch with setNames()
# would mis-assign ratings.
teams <- unique(c(schedules$home_team, schedules$away_team))
elo <- setNames(rep(1500, length(teams)), teams)
k_factor <- 20

# Expected score for a rating r1 against r2 (standard Elo logistic).
expected <- function(r1, r2) 1 / (1 + 10^((r2 - r1)/400))

# Rating change awarded to the winner, scaled by a margin-of-victory
# multiplier (FiveThirtyEight-style).
update_elo <- function(winner_rating, loser_rating, margin) {
  exp_win <- expected(winner_rating, loser_rating)
  mov_mult <- log(abs(margin) + 1) * 2.2 / ((winner_rating - loser_rating) * 0.001 + 2.2)
  k_factor * mov_mult * (1 - exp_win)
}

# Process completed games in schedule order
games <- schedules %>%
  filter(!is.na(result)) %>%
  arrange(game_id)

for (i in seq_len(nrow(games))) {
  g <- games[i, ]
  home_elo <- elo[g$home_team] + 55  # home-field advantage bump
  away_elo <- elo[g$away_team]

  if (g$result > 0) {  # Home win
    change <- update_elo(home_elo, away_elo, g$result)
    elo[g$home_team] <- elo[g$home_team] + change
    elo[g$away_team] <- elo[g$away_team] - change
  } else {  # Away win (a tie gives margin 0 => log(1) = 0 => no change)
    change <- update_elo(away_elo, home_elo, -g$result)
    elo[g$away_team] <- elo[g$away_team] + change
    elo[g$home_team] <- elo[g$home_team] - change
  }
}

# Current Elo rankings
elo_df <- data.frame(team = names(elo), elo = elo) %>%
  arrange(desc(elo))
print(elo_df)
import nfl_data_py as nfl
import pandas as pd
import numpy as np

schedules = nfl.import_schedules([2020, 2021, 2022, 2023])

# Every team that appears anywhere in the schedule starts at 1500.
all_teams = pd.concat([schedules["home_team"], schedules["away_team"]]).unique()
elo = dict.fromkeys(all_teams, 1500)
k_factor = 20

def expected(r1, r2):
    """Expected score (win probability) of a team rated r1 vs one rated r2."""
    return 1 / (1 + 10 ** ((r2 - r1) / 400))

def update_elo(winner_rating, loser_rating, margin):
    """Elo change for the winner, scaled by a margin-of-victory multiplier."""
    win_prob = expected(winner_rating, loser_rating)
    mov_mult = np.log(abs(margin) + 1) * 2.2 / ((winner_rating - loser_rating) * 0.001 + 2.2)
    return k_factor * mov_mult * (1 - win_prob)

# Walk completed games in schedule order, updating both teams after each.
games = schedules[schedules["result"].notna()].sort_values("game_id")

for g in games.itertuples():
    home_elo = elo[g.home_team] + 55  # Home field advantage
    away_elo = elo[g.away_team]

    if g.result > 0:  # Home win
        delta = update_elo(home_elo, away_elo, g.result)
        elo[g.home_team] += delta
        elo[g.away_team] -= delta
    else:  # Away win
        delta = update_elo(away_elo, home_elo, -g.result)
        elo[g.away_team] += delta
        elo[g.home_team] -= delta

# Current Elo rankings
elo_df = (pd.DataFrame({"team": list(elo.keys()), "elo": list(elo.values())})
          .sort_values("elo", ascending=False)
          .reset_index(drop=True))
print("Current Elo Rankings:")
print(elo_df)
Packages: nflfastR tidyverse nfl_data_py pandas numpy
Power Rankings Model
Create composite power rankings using multiple metrics.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)
schedules <- load_schedules(2023)

# Calculate multiple components
# Offensive EPA/success per team, joined to the EPA/success each team
# ALLOWED (same play pool, grouped by defteam instead of posteam).
team_metrics <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
  group_by(posteam) %>%
  summarize(
    off_epa = mean(epa),
    success_rate = mean(success) * 100,
    .groups = "drop"
  ) %>%
  left_join(
    pbp %>%
      filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
      group_by(defteam) %>%
      summarize(
        def_epa = mean(epa),
        def_success_allowed = mean(success) * 100
      ),
    by = c("posteam" = "defteam")
  )

# Get win-loss record
# Reshape to one row per team per game; a win is a home team with a
# positive result or an away team with a negative one.
records <- schedules %>%
  filter(!is.na(result)) %>%
  pivot_longer(c(home_team, away_team), names_to = "location", values_to = "team") %>%
  mutate(
    win = (location == "home_team" & result > 0) | (location == "away_team" & result < 0)
  ) %>%
  group_by(team) %>%
  summarize(wins = sum(win), games = n(), win_pct = wins/games)

# Combine into power ranking
power_rankings <- team_metrics %>%
  left_join(records, by = c("posteam" = "team")) %>%
  mutate(
    # Normalize each metric to 0-100
    off_score = (off_epa - min(off_epa)) / (max(off_epa) - min(off_epa)) * 100,
    # Invert defense: lower EPA allowed is better.
    def_score = 100 - (def_epa - min(def_epa)) / (max(def_epa) - min(def_epa)) * 100,
    win_score = win_pct * 100,

    # Composite (40% offense, 40% defense, 20% record)
    power_rating = off_score * 0.4 + def_score * 0.4 + win_score * 0.2
  ) %>%
  arrange(desc(power_rating)) %>%
  mutate(rank = row_number())

print(power_rankings %>% select(posteam, rank, power_rating, off_epa, def_epa, win_pct))
import nfl_data_py as nfl
import pandas as pd
import numpy as np

pbp = nfl.import_pbp_data([2023])
schedules = nfl.import_schedules([2023])

# Calculate team metrics on offensive snaps with a valid EPA.
plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"].isin(["pass", "run"]))]

off_stats = plays.groupby("posteam").agg(
    off_epa=("epa", "mean"),
    success_rate=("success", lambda x: x.mean() * 100)
).reset_index()

def_stats = plays.groupby("defteam").agg(
    def_epa=("epa", "mean")
).reset_index()

team_metrics = off_stats.merge(def_stats, left_on="posteam", right_on="defteam")

# Get win-loss records.
# Use .add(..., fill_value=0) everywhere two Series are combined: a bare
# `+` aligns on index and yields NaN for any team missing from one side.
games = schedules[schedules["result"].notna()]
home_wins = games[games["result"] > 0].groupby("home_team").size()
away_wins = games[games["result"] < 0].groupby("away_team").size()
total_games = games.groupby("home_team").size().add(
    games.groupby("away_team").size(), fill_value=0)
total_wins = home_wins.add(away_wins, fill_value=0)
win_pct = (total_wins / total_games).reset_index()
win_pct.columns = ["team", "win_pct"]

# Combine offense, defense, and record into one frame.
power_rankings = team_metrics.merge(win_pct, left_on="posteam", right_on="team")

def normalize(col):
    """Min-max scale a column to the 0-100 range."""
    return (col - col.min()) / (col.max() - col.min()) * 100

power_rankings["off_score"] = normalize(power_rankings["off_epa"])
# Lower defensive EPA allowed is better, so invert the scale.
power_rankings["def_score"] = 100 - normalize(power_rankings["def_epa"])
power_rankings["win_score"] = power_rankings["win_pct"] * 100
# Composite: 40% offense, 40% defense, 20% record.
power_rankings["power_rating"] = (power_rankings["off_score"] * 0.4 +
                                   power_rankings["def_score"] * 0.4 +
                                   power_rankings["win_score"] * 0.2)

power_rankings = power_rankings.sort_values("power_rating", ascending=False).reset_index(drop=True)
power_rankings["rank"] = range(1, len(power_rankings) + 1)

print(power_rankings[["posteam", "rank", "power_rating", "off_epa", "def_epa", "win_pct"]])
Packages: nflfastR tidyverse nfl_data_py pandas numpy
Home Field Advantage Analysis
Quantify home field advantage across the NFL.
Intermediate
library(nflfastR)
library(tidyverse)

schedules <- load_schedules(2019:2023)

# Overall HFA
# `result` is the home margin, so result > 0 means a home win.
hfa_overall <- schedules %>%
  filter(!is.na(result)) %>%
  summarize(
    games = n(),
    home_wins = sum(result > 0),
    ties = sum(result == 0),
    away_wins = sum(result < 0),
    home_win_pct = mean(result > 0) * 100,
    avg_home_margin = mean(result)
  )

print("Overall Home Field Advantage:")
print(hfa_overall)

# HFA by team (home record and average home margin per stadium)
team_hfa <- schedules %>%
  filter(!is.na(result)) %>%
  group_by(home_team) %>%
  summarize(
    home_games = n(),
    home_wins = sum(result > 0),
    home_win_pct = mean(result > 0) * 100,
    avg_margin = mean(result),
    .groups = "drop"
  ) %>%
  arrange(desc(home_win_pct))

print("\nHFA by Stadium:")
print(team_hfa)

# HFA by season (trend over time)
hfa_by_season <- schedules %>%
  filter(!is.na(result)) %>%
  group_by(season) %>%
  summarize(
    home_win_pct = mean(result > 0) * 100,
    avg_margin = mean(result)
  )

print("\nHFA Trend by Season:")
print(hfa_by_season)
import nfl_data_py as nfl
import pandas as pd

schedules = nfl.import_schedules([2019, 2020, 2021, 2022, 2023])

# Overall HFA: `result` is the home margin, so result > 0 is a home win.
games = schedules[schedules["result"].notna()]
home_margin = games["result"]

hfa_overall = {
    "games": len(games),
    "home_wins": (home_margin > 0).sum(),
    "away_wins": (home_margin < 0).sum(),
    "home_win_pct": (home_margin > 0).mean() * 100,
    "avg_home_margin": home_margin.mean()
}

print("Overall Home Field Advantage:")
print(pd.DataFrame([hfa_overall]))

def _win_count(margins):
    """Number of home wins among a group of result margins."""
    return (margins > 0).sum()

def _win_pct(margins):
    """Home win percentage (0-100) among a group of result margins."""
    return (margins > 0).mean() * 100

# HFA by team: home record and margin for each stadium.
team_hfa = (games.groupby("home_team")["result"]
    .agg(
        home_games="count",
        home_wins=_win_count,
        home_win_pct=_win_pct,
        avg_margin="mean"
    )
    .reset_index()
    .sort_values("home_win_pct", ascending=False))

print("\nHFA by Stadium:")
print(team_hfa)

# HFA by season: league-wide trend over time.
hfa_by_season = (games.groupby("season")["result"]
    .agg(
        home_win_pct=_win_pct,
        avg_margin="mean"
    )
    .reset_index())

print("\nHFA Trend by Season:")
print(hfa_by_season)
Packages: nflfastR tidyverse nfl_data_py pandas
Key Number Analysis
Analyze final score margins and key betting numbers.
Intermediate
library(nflfastR)
library(tidyverse)

schedules <- load_schedules(2018:2023)

# Calculate margin distribution
# Group completed games by absolute final margin, ranked by frequency.
margins <- schedules %>%
  filter(!is.na(result)) %>%
  mutate(margin = abs(result)) %>%
  group_by(margin) %>%
  summarize(games = n()) %>%
  mutate(pct = games / sum(games) * 100) %>%
  arrange(desc(games))

# Top key numbers
# Cumulative share covered by the 15 most common margins.
key_numbers <- margins %>%
  head(15) %>%
  mutate(cumulative_pct = cumsum(pct))

print("Top 15 Final Margins:")
print(key_numbers)

# Games landing on 3 and 7
on_3_or_7 <- schedules %>%
  filter(!is.na(result)) %>%
  mutate(
    on_3 = abs(result) == 3,
    on_7 = abs(result) == 7,
    on_key = abs(result) %in% c(3, 7, 10, 14)  # classic key numbers
  ) %>%
  summarize(
    total_games = n(),
    # Use distinct output names for the counts: in the original,
    # `on_3 = sum(on_3)` masked the logical column, so the later
    # `mean(on_3) * 100` was computed on the scalar COUNT (summarize
    # evaluates sequentially) and reported count*100, not a percentage.
    on_3_games = sum(on_3),
    on_7_games = sum(on_7),
    on_3_pct = mean(on_3) * 100,
    on_7_pct = mean(on_7) * 100,
    any_key_pct = mean(on_key) * 100
  )

print("\nKey Number Summary:")
print(on_3_or_7)
import nfl_data_py as nfl
import pandas as pd

schedules = nfl.import_schedules([2018, 2019, 2020, 2021, 2022, 2023])

# Distribution of absolute final margins over all completed games.
completed = schedules[schedules["result"].notna()].copy()
completed["margin"] = completed["result"].abs()

margin_dist = (completed.groupby("margin")
    .size()
    .reset_index(name="games")
    .sort_values("games", ascending=False))
margin_dist["pct"] = margin_dist["games"] / margin_dist["games"].sum() * 100
margin_dist["cumulative_pct"] = margin_dist["pct"].cumsum()

print("Top 15 Final Margins:")
print(margin_dist.head(15))

# How often games land exactly on the classic key numbers.
completed["on_3"] = completed["margin"].eq(3)
completed["on_7"] = completed["margin"].eq(7)
completed["on_key"] = completed["margin"].isin([3, 7, 10, 14])

key_summary = pd.DataFrame([{
    "total_games": len(completed),
    "on_3": completed["on_3"].sum(),
    "on_7": completed["on_7"].sum(),
    "on_3_pct": completed["on_3"].mean() * 100,
    "on_7_pct": completed["on_7"].mean() * 100,
    "any_key_pct": completed["on_key"].mean() * 100
}])

print("\nKey Number Summary:")
print(key_summary)
Packages: nflfastR tidyverse nfl_data_py pandas
Closing Line Value Analysis
Analyze spread results by spread size and favorite/underdog status as a foundation for closing line value work.
Advanced
library(nflfastR)
library(tidyverse)

schedules <- load_schedules(2023)

# Analyze spread movement patterns
# NOTE(review): this assumes `spread_line` is quoted so that
# result + spread_line > 0 means the home team covered, and that a
# negative spread_line marks a home favorite -- confirm against the
# nflverse schedules data dictionary.
spread_analysis <- schedules %>%
  filter(!is.na(result), !is.na(spread_line)) %>%
  mutate(
    # Positive result means home win
    home_covered = result + spread_line > 0,
    favorite = if_else(spread_line < 0, "Home", "Away"),
    spread_bucket = case_when(
      abs(spread_line) <= 3 ~ "Pick/FG",
      abs(spread_line) <= 7 ~ "4-7 pts",
      abs(spread_line) <= 10 ~ "8-10 pts",
      TRUE ~ "10+ pts"
    )
  )

# ATS performance by spread size
ats_by_spread <- spread_analysis %>%
  group_by(spread_bucket) %>%
  summarize(
    games = n(),
    home_cover_pct = mean(home_covered) * 100,
    avg_margin = mean(result + spread_line),  # average cover margin
    .groups = "drop"
  )

print("ATS Performance by Spread Size:")
print(ats_by_spread)

# Favorite vs underdog ATS
fav_vs_dog <- spread_analysis %>%
  mutate(
    favorite_covered = (favorite == "Home" & home_covered) |
                       (favorite == "Away" & !home_covered)
  ) %>%
  summarize(
    games = n(),
    favorite_cover_pct = mean(favorite_covered) * 100,
    dog_cover_pct = (1 - mean(favorite_covered)) * 100
  )

print("\nFavorite vs Underdog ATS:")
print(fav_vs_dog)
import nfl_data_py as nfl
import pandas as pd
import numpy as np

schedules = nfl.import_schedules([2023])

# Analyze spread patterns
# NOTE(review): assumes spread_line is quoted so that
# result + spread_line > 0 means the home team covered, and that a
# negative spread_line marks a home favorite -- confirm against the
# nflverse schedules data dictionary.
games = schedules[(schedules["result"].notna()) & (schedules["spread_line"].notna())].copy()

games["home_covered"] = games["result"] + games["spread_line"] > 0
games["favorite"] = np.where(games["spread_line"] < 0, "Home", "Away")

def spread_bucket(s):
    """Bucket a spread by its absolute size."""
    s = abs(s)
    if s <= 3: return "Pick/FG"
    elif s <= 7: return "4-7 pts"
    elif s <= 10: return "8-10 pts"
    else: return "10+ pts"

games["spread_bucket"] = games["spread_line"].apply(spread_bucket)

# ATS performance by spread size
ats_by_spread = (games.groupby("spread_bucket")
    .agg(
        games=("home_covered", "count"),
        home_cover_pct=("home_covered", lambda x: x.mean() * 100),
        # Re-align with the parent frame to combine two columns inside
        # a named aggregation (average cover margin).
        avg_margin=("result", lambda x: (x + games.loc[x.index, "spread_line"]).mean())
    )
    .reset_index())

print("ATS Performance by Spread Size:")
print(ats_by_spread)

# Favorite vs underdog
games["favorite_covered"] = ((games["favorite"] == "Home") & games["home_covered"]) | \
                            ((games["favorite"] == "Away") & ~games["home_covered"])

fav_cover_pct = games["favorite_covered"].mean() * 100
print(f"\nFavorite cover %: {fav_cover_pct:.1f}%")
print(f"Underdog cover %: {100 - fav_cover_pct:.1f}%")
Packages: nflfastR tidyverse nfl_data_py pandas numpy
First Half vs Full Game Analysis
Compare first half and full game betting results.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)
schedules <- load_schedules(2023)

# Calculate first half scores.
# Use total_home_score / total_away_score (the RUNNING score on each
# play). The original used home_score / away_score, which hold the
# FINAL score on every pbp row, so "halftime" silently equaled the
# final margin.
first_half <- pbp %>%
  filter(qtr <= 2) %>%
  group_by(game_id) %>%
  summarize(
    home_1h = total_home_score[which.max(play_id)],
    away_1h = total_away_score[which.max(play_id)],
    .groups = "drop"
  )

# Get final scores
final_scores <- schedules %>%
  filter(!is.na(result)) %>%
  select(game_id, home_score, away_score, spread_line, result)

# Combine
game_halves <- final_scores %>%
  left_join(first_half, by = "game_id") %>%
  mutate(
    home_2h = home_score - home_1h,
    away_2h = away_score - away_1h,
    result_1h = home_1h - away_1h,
    result_2h = home_2h - away_2h,

    # Compare halves
    home_stronger_2h = result_2h > result_1h,
    halves_same_winner = (result_1h > 0) == (result > 0)
  )

# Analysis
half_analysis <- game_halves %>%
  summarize(
    games = n(),
    avg_1h_margin = mean(result_1h),
    avg_2h_margin = mean(result_2h),
    same_winner_pct = mean(halves_same_winner, na.rm = TRUE) * 100,
    home_stronger_2h_pct = mean(home_stronger_2h, na.rm = TRUE) * 100
  )

print("First Half vs Second Half Analysis:")
print(half_analysis)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])
schedules = nfl.import_schedules([2023])

# Calculate first half scores.
# total_home_score / total_away_score are the RUNNING score on each
# play; home_score / away_score hold the FINAL score on every row, so
# taking their "last" value (as the original did) made "halftime" equal
# the final score. Sort by play_id so "last" is truly the end of Q2.
first_half = (pbp[pbp["qtr"] <= 2]
    .sort_values("play_id")
    .groupby("game_id")
    .agg(
        home_1h=("total_home_score", "last"),
        away_1h=("total_away_score", "last")
    )
    .reset_index())

# Get final scores
final_scores = schedules[schedules["result"].notna()][
    ["game_id", "home_score", "away_score", "spread_line", "result"]
]

# Combine halftime and final scores; second-half points are the difference.
game_halves = final_scores.merge(first_half, on="game_id")
game_halves["home_2h"] = game_halves["home_score"] - game_halves["home_1h"]
game_halves["away_2h"] = game_halves["away_score"] - game_halves["away_1h"]
game_halves["result_1h"] = game_halves["home_1h"] - game_halves["away_1h"]
game_halves["result_2h"] = game_halves["home_2h"] - game_halves["away_2h"]

game_halves["same_winner"] = (game_halves["result_1h"] > 0) == (game_halves["result"] > 0)
game_halves["home_stronger_2h"] = game_halves["result_2h"] > game_halves["result_1h"]

# Analysis
half_analysis = {
    "games": len(game_halves),
    "avg_1h_margin": game_halves["result_1h"].mean(),
    "avg_2h_margin": game_halves["result_2h"].mean(),
    "same_winner_pct": game_halves["same_winner"].mean() * 100,
    "home_stronger_2h_pct": game_halves["home_stronger_2h"].mean() * 100
}

print("First Half vs Second Half Analysis:")
print(pd.DataFrame([half_analysis]))
Packages: nflfastR tidyverse nfl_data_py pandas
Division Game Trends
Analyze betting patterns for division rivalry games.
Intermediate
library(nflfastR)
library(tidyverse)

schedules <- load_schedules(2019:2023)

# Define divisions. Legacy/alternate nflverse abbreviations are included
# so older seasons still match ("OAK" for the 2019 Raiders, "LA" for the
# Rams); otherwise those games silently fall out of the division split.
divisions <- list(
  AFC_East = c("BUF", "MIA", "NE", "NYJ"),
  AFC_North = c("BAL", "CIN", "CLE", "PIT"),
  AFC_South = c("HOU", "IND", "JAX", "TEN"),
  AFC_West = c("DEN", "KC", "LV", "OAK", "LAC"),
  NFC_East = c("DAL", "NYG", "PHI", "WAS"),
  NFC_North = c("CHI", "DET", "GB", "MIN"),
  NFC_South = c("ATL", "CAR", "NO", "TB"),
  NFC_West = c("ARI", "LAR", "LA", "SF", "SEA")
)

# Look up a team's division; NA when the abbreviation is unknown.
get_division <- function(team) {
  for (div in names(divisions)) {
    if (team %in% divisions[[div]]) return(div)
  }
  return(NA_character_)
}

# Analyze division games
div_games <- schedules %>%
  filter(!is.na(result), !is.na(spread_line)) %>%
  rowwise() %>%
  mutate(
    home_div = get_division(home_team),
    away_div = get_division(away_team),
    # NA guard: an unknown abbreviation must never count as a division game.
    is_division_game = !is.na(home_div) & !is.na(away_div) & home_div == away_div
  ) %>%
  ungroup()

# Compare division vs non-division
div_comparison <- div_games %>%
  group_by(is_division_game) %>%
  summarize(
    games = n(),
    home_win_pct = mean(result > 0) * 100,
    home_cover_pct = mean(result + spread_line > 0) * 100,
    avg_total = mean(home_score + away_score),
    avg_margin = mean(abs(result)),
    .groups = "drop"
  )

print("Division vs Non-Division Game Analysis:")
print(div_comparison)
import nfl_data_py as nfl
import pandas as pd

schedules = nfl.import_schedules([2019, 2020, 2021, 2022, 2023])

# Define divisions. Legacy/alternate nflverse abbreviations are included
# so older seasons still match ("OAK" for the 2019 Raiders, "LA" for the
# Rams); otherwise those games silently fall out of the division split.
divisions = {
    "AFC_East": ["BUF", "MIA", "NE", "NYJ"],
    "AFC_North": ["BAL", "CIN", "CLE", "PIT"],
    "AFC_South": ["HOU", "IND", "JAX", "TEN"],
    "AFC_West": ["DEN", "KC", "LV", "OAK", "LAC"],
    "NFC_East": ["DAL", "NYG", "PHI", "WAS"],
    "NFC_North": ["CHI", "DET", "GB", "MIN"],
    "NFC_South": ["ATL", "CAR", "NO", "TB"],
    "NFC_West": ["ARI", "LAR", "LA", "SF", "SEA"]
}

# Invert once into a team -> division lookup table.
team_to_division = {team: div for div, teams in divisions.items() for team in teams}

def get_division(team):
    """Return the division for a team abbreviation, or None if unknown."""
    return team_to_division.get(team)

# Analyze division games
games = schedules[(schedules["result"].notna()) & (schedules["spread_line"].notna())].copy()
games["home_div"] = games["home_team"].map(team_to_division)
games["away_div"] = games["away_team"].map(team_to_division)
# notna() guard: two UNKNOWN teams would otherwise compare None == None
# elementwise, which is True and mislabels the game as a division game.
games["is_division_game"] = (games["home_div"].notna() &
                             (games["home_div"] == games["away_div"]))

# Compare division vs non-division
div_comparison = (games.groupby("is_division_game")
    .agg(
        games=("result", "count"),
        home_win_pct=("result", lambda x: (x > 0).mean() * 100),
        home_cover_pct=("result", lambda x: ((x + games.loc[x.index, "spread_line"]) > 0).mean() * 100),
        avg_total=("home_score", lambda x: (x + games.loc[x.index, "away_score"]).mean()),
        avg_margin=("result", lambda x: x.abs().mean())
    )
    .reset_index())

div_comparison["is_division_game"] = div_comparison["is_division_game"].map({True: "Division", False: "Non-Division"})
print("Division vs Non-Division Game Analysis:")
print(div_comparison)
Packages: nflfastR tidyverse nfl_data_py pandas

Fantasy Football

Fantasy football analysis, projections, and trade evaluations

Fantasy Points Calculation
Calculate fantasy points for all skill position players.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# PPR Fantasy Points calculation
# Passing: 0.04 per yard, 4 per TD, -2 per INT
# Rushing: 0.1 per yard, 6 per TD
# Receiving: 0.1 per yard, 6 per TD, 1 per reception

# QB Fantasy Points
qb_fantasy <- pbp %>%
  filter(!is.na(passer_player_id)) %>%
  group_by(passer_player_id, passer_player_name) %>%
  summarize(
    pass_yards = sum(passing_yards, na.rm = TRUE),
    pass_tds = sum(pass_touchdown, na.rm = TRUE),
    ints = sum(interception),
    # Later summarize expressions can reference the totals above.
    fantasy_pts = pass_yards * 0.04 + pass_tds * 4 - ints * 2,
    .groups = "drop"
  ) %>%
  arrange(desc(fantasy_pts))

# RB/WR/TE Fantasy Points (PPR)
# Each play is credited to the rusher when present, otherwise to the
# targeted receiver.
skill_fantasy <- pbp %>%
  filter(!is.na(rusher_player_id) | !is.na(receiver_player_id)) %>%
  mutate(
    player_id = coalesce(rusher_player_id, receiver_player_id),
    player_name = coalesce(rusher_player_name, receiver_player_name)
  ) %>%
  group_by(player_id, player_name) %>%
  summarize(
    rush_yards = sum(rushing_yards, na.rm = TRUE),
    rush_tds = sum(rush_touchdown, na.rm = TRUE),
    receptions = sum(complete_pass, na.rm = TRUE),
    rec_yards = sum(receiving_yards, na.rm = TRUE),
    # Receiving TDs: pass TDs only on rows where this player was the receiver.
    rec_tds = sum(pass_touchdown[!is.na(receiver_player_id)], na.rm = TRUE),
    fantasy_pts = rush_yards * 0.1 + rush_tds * 6 +
                  receptions * 1 + rec_yards * 0.1 + rec_tds * 6,
    .groups = "drop"
  ) %>%
  filter(fantasy_pts > 50) %>%  # trim fringe players
  arrange(desc(fantasy_pts))

print(skill_fantasy %>% head(30))
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# QB Fantasy Points
# Scoring: 0.04 per pass yard, 4 per pass TD, -2 per INT.
qb_plays = pbp[pbp["passer_player_id"].notna()]
qb_fantasy = (qb_plays.groupby(["passer_player_id", "passer_player_name"])
    .agg(
        pass_yards=("passing_yards", "sum"),
        pass_tds=("pass_touchdown", "sum"),
        ints=("interception", "sum")
    )
    .reset_index())
qb_fantasy["fantasy_pts"] = (qb_fantasy["pass_yards"] * 0.04 +
                              qb_fantasy["pass_tds"] * 4 -
                              qb_fantasy["ints"] * 2)
qb_fantasy = qb_fantasy.sort_values("fantasy_pts", ascending=False)

# Skill position fantasy points (simplified)
# Rushing: 0.1 per yard, 6 per TD.
rush_plays = pbp[pbp["rusher_player_id"].notna()]
rush_pts = (rush_plays.groupby(["rusher_player_id", "rusher_player_name"])
    .agg(
        rush_yards=("rushing_yards", "sum"),
        rush_tds=("rush_touchdown", "sum")
    )
    .reset_index())
rush_pts.columns = ["player_id", "player_name", "rush_yards", "rush_tds"]

# Receiving (PPR): 1 per reception, 0.1 per yard, 6 per TD.
# rec_tds sums pass_touchdown on the rows where this player was targeted,
# crediting the receiving TD to the receiver.
rec_plays = pbp[pbp["receiver_player_id"].notna()]
rec_pts = (rec_plays.groupby(["receiver_player_id", "receiver_player_name"])
    .agg(
        receptions=("complete_pass", "sum"),
        rec_yards=("receiving_yards", "sum"),
        rec_tds=("pass_touchdown", "sum")
    )
    .reset_index())
rec_pts.columns = ["player_id", "player_name", "receptions", "rec_yards", "rec_tds"]

# Combine
# Outer merge keeps rush-only and receive-only players; fillna(0)
# zeroes the missing half of each stat line.
skill_fantasy = rush_pts.merge(rec_pts, on=["player_id", "player_name"], how="outer").fillna(0)
skill_fantasy["fantasy_pts"] = (skill_fantasy["rush_yards"] * 0.1 +
                                 skill_fantasy["rush_tds"] * 6 +
                                 skill_fantasy["receptions"] * 1 +
                                 skill_fantasy["rec_yards"] * 0.1 +
                                 skill_fantasy["rec_tds"] * 6)
skill_fantasy = skill_fantasy[skill_fantasy["fantasy_pts"] > 50].sort_values("fantasy_pts", ascending=False)

print(skill_fantasy.head(30))
Packages: nflfastR tidyverse nfl_data_py pandas
Target Share Analysis
Analyze target distribution within offenses for fantasy value.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Calculate target share by team.
# Team denominators (targets AND air yards) are attached per play
# before collapsing to receivers, so each share is relative to the
# receiver's own team. The original computed
# `air_yard_share = air_yards / sum(air_yards)` AFTER the grouping was
# dropped, i.e. divided by the league-wide total instead of the team's.
target_share <- pbp %>%
  filter(play_type == "pass", !is.na(receiver_player_id)) %>%
  group_by(posteam) %>%
  mutate(
    team_targets = n(),
    team_air_yards = sum(air_yards, na.rm = TRUE)
  ) %>%
  group_by(posteam, receiver_player_id, receiver_player_name,
           team_targets, team_air_yards) %>%
  summarize(
    targets = n(),
    air_yards = sum(air_yards, na.rm = TRUE),
    receptions = sum(complete_pass),
    yards = sum(yards_gained, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    target_share = targets / team_targets,
    air_yard_share = air_yards / team_air_yards,
    wopr = target_share * 1.5 + air_yard_share * 0.7  # Weighted Opportunity Rating
  ) %>%
  arrange(desc(wopr))

# Top target share players
print(target_share %>%
        select(receiver_player_name, posteam, target_share,
               air_yard_share, wopr) %>%
        head(25))
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Filter to pass plays with receiver
pass_plays = pbp[(pbp["play_type"] == "pass") &
                 (pbp["receiver_player_id"].notna())]

# Team totals
# Denominators for the share metrics: each team's total targets and
# total air yards.
team_targets = pass_plays.groupby("posteam").agg(
    team_targets=("play_id", "count"),
    team_air_yards=("air_yards", "sum")
).reset_index()

# Player stats
player_stats = (pass_plays.groupby(
    ["posteam", "receiver_player_id", "receiver_player_name"])
    .agg(
        targets=("play_id", "count"),
        air_yards=("air_yards", "sum"),
        receptions=("complete_pass", "sum"),
        yards=("yards_gained", "sum")
    )
    .reset_index())

# Calculate shares
target_share = player_stats.merge(team_targets, on="posteam")
target_share["target_share"] = target_share["targets"] / target_share["team_targets"]
target_share["air_yard_share"] = target_share["air_yards"] / target_share["team_air_yards"]
# WOPR (Weighted Opportunity Rating): 1.5 * target share + 0.7 * air-yard share.
target_share["wopr"] = target_share["target_share"] * 1.5 + target_share["air_yard_share"] * 0.7

result = target_share.nlargest(25, "wopr")[
    ["receiver_player_name", "posteam", "target_share", "air_yard_share", "wopr"]
]
print(result)
Packages: nflfastR tidyverse nfl_data_py pandas
Snap Count Analysis
Analyze snap counts and playing time share for fantasy value.
Intermediate
library(nflfastR)
library(tidyverse)

# Load participation data
participation <- load_participation(2023)
rosters <- fast_scraper_roster(2023)

# Calculate snap counts
# offense_players is a semicolon-delimited list of GSIS ids on the
# field; separate_rows() yields one row per player per play, so n()
# per (game, player) is that player's offensive snap count.
snap_counts <- participation %>%
  separate_rows(offense_players, sep = ";") %>%
  filter(offense_players != "") %>%
  group_by(nflverse_game_id, offense_players) %>%
  summarize(snaps = n(), .groups = "drop") %>%
  rename(gsis_id = offense_players)

# Join with roster for names
snap_counts <- snap_counts %>%
  left_join(rosters %>% select(gsis_id, full_name, position, team),
            by = "gsis_id")

# Calculate season totals
season_snaps <- snap_counts %>%
  group_by(gsis_id, full_name, position, team) %>%
  summarize(
    games = n(),  # one row per game above, so n() = games with a snap
    total_snaps = sum(snaps),
    avg_snaps = mean(snaps),
    .groups = "drop"
  ) %>%
  filter(position %in% c("RB", "WR", "TE")) %>%
  arrange(desc(total_snaps))

print(season_snaps %>% head(30))
import nfl_data_py as nfl
import pandas as pd

# Load participation data
# NOTE(review): this uses the weekly snap-count dataset (already
# aggregated per player-week), not play-level participation like the
# R example -- confirm the two examples are meant to mirror each other.
participation = nfl.import_snap_counts([2023])
rosters = nfl.import_rosters([2023])

# Aggregate by player
season_snaps = (participation.groupby("pfr_player_id")
    .agg(
        games=("week", "nunique"),
        total_off_snaps=("offense_snaps", "sum"),
        avg_off_snaps=("offense_snaps", "mean"),
        total_def_snaps=("defense_snaps", "sum")
    )
    .reset_index())

# Join with roster data
# drop_duplicates() guards against multiple roster rows per player.
roster_subset = rosters[["pfr_id", "player_name", "position", "team"]].drop_duplicates()
season_snaps = season_snaps.merge(roster_subset, left_on="pfr_player_id",
                                   right_on="pfr_id", how="left")

# Filter skill positions
skill_snaps = season_snaps[season_snaps["position"].isin(["RB", "WR", "TE"])]
skill_snaps = skill_snaps.sort_values("total_off_snaps", ascending=False)

print("Top Snap Count Leaders (Skill Positions):")
print(skill_snaps.head(30))
Packages: nflfastR tidyverse nfl_data_py pandas
Touchdown Regression Candidates
Identify players due for positive or negative TD regression.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Calculate TD rate vs expected.
# expected_td_rate is a deliberately simplified proxy: 10% of the
# player's red-zone target share. td_diff measures over/under-performance.
td_analysis <- pbp %>%
  filter(play_type == "pass", !is.na(receiver_player_id)) %>%
  group_by(receiver_player_id, receiver_player_name) %>%
  summarize(
    targets = n(),
    rz_targets = sum(yardline_100 <= 20, na.rm = TRUE),
    # na.rm = TRUE added: a single NA pass_touchdown would otherwise
    # turn the sum — and every derived rate below — into NA.
    tds = sum(pass_touchdown, na.rm = TRUE),
    expected_td_rate = 0.10 * (rz_targets / targets),  # Simplified expected
    actual_td_rate = tds / targets,
    .groups = "drop"
  ) %>%
  filter(targets >= 50) %>%
  mutate(
    td_diff = actual_td_rate - expected_td_rate,
    regression_direction = case_when(
      td_diff > 0.03 ~ "Negative",
      td_diff < -0.03 ~ "Positive",
      TRUE ~ "Stable"
    )
  )

# Candidates for positive regression (underperformed)
positive_regression <- td_analysis %>%
  filter(regression_direction == "Positive") %>%
  arrange(td_diff)

print("Positive Regression Candidates (Due for more TDs):")
print(positive_regression %>% select(receiver_player_name, targets, tds, td_diff))

# Candidates for negative regression (overperformed)
negative_regression <- td_analysis %>%
  filter(regression_direction == "Negative") %>%
  arrange(desc(td_diff))

print("\nNegative Regression Candidates (TD rate unsustainable):")
print(negative_regression %>% select(receiver_player_name, targets, tds, td_diff))
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Restrict to targeted pass plays.
targeted = pbp[(pbp["play_type"] == "pass") & (pbp["receiver_player_id"].notna())]

# Per-receiver totals: targets, red-zone targets, touchdowns.
td_analysis = targeted.groupby(["receiver_player_id", "receiver_player_name"]).agg(
    targets=("play_id", "count"),
    rz_targets=("yardline_100", lambda s: (s <= 20).sum()),
    tds=("pass_touchdown", "sum"),
).reset_index()

# Minimum-volume cutoff, then derive rates. Expected TD rate is a
# simplified proxy: 10% of the player's red-zone target share.
td_analysis = td_analysis[td_analysis["targets"] >= 50]
td_analysis["rz_rate"] = td_analysis["rz_targets"] / td_analysis["targets"]
td_analysis["expected_td_rate"] = 0.10 * td_analysis["rz_rate"]
td_analysis["actual_td_rate"] = td_analysis["tds"] / td_analysis["targets"]
td_analysis["td_diff"] = td_analysis["actual_td_rate"] - td_analysis["expected_td_rate"]

# Underperformers (TD rate well below expectation) are "positive regression" bets.
positive = td_analysis[td_analysis["td_diff"] < -0.03].sort_values("td_diff")
print("Positive Regression Candidates (Due for more TDs):")
print(positive[["receiver_player_name", "targets", "tds", "td_diff"]].head(10))

# Overperformers whose TD rate likely will not hold up.
negative = td_analysis[td_analysis["td_diff"] > 0.03].sort_values("td_diff", ascending=False)
print("\nNegative Regression Candidates (TD rate unsustainable):")
print(negative[["receiver_player_name", "targets", "tds", "td_diff"]].head(10))
Packages: nflfastR tidyverse nfl_data_py pandas
Red Zone Target Leaders
Identify top red zone target getters for TD upside.
Beginner
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Red zone targets analysis (targets inside the opponent 20)
rz_targets <- pbp %>%
  filter(play_type == "pass", yardline_100 <= 20, !is.na(receiver_player_id)) %>%
  group_by(receiver_player_id, receiver_player_name, posteam) %>%
  summarize(
    rz_targets = n(),
    # na.rm = TRUE added: any NA in complete_pass / pass_touchdown would
    # otherwise propagate and NA out the catch/TD rates below.
    rz_receptions = sum(complete_pass, na.rm = TRUE),
    rz_tds = sum(pass_touchdown, na.rm = TRUE),
    avg_depth = mean(air_yards, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    rz_catch_rate = rz_receptions / rz_targets * 100,
    rz_td_rate = rz_tds / rz_targets * 100
  ) %>%
  arrange(desc(rz_targets))

print("Top Red Zone Target Leaders:")
print(rz_targets %>% head(25))

# Inside 10 yard line
goalline <- pbp %>%
  filter(play_type == "pass", yardline_100 <= 10, !is.na(receiver_player_id)) %>%
  group_by(receiver_player_name) %>%
  summarize(
    gl_targets = n(),
    gl_tds = sum(pass_touchdown, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(gl_targets))

print("\nGoal Line Target Leaders (Inside 10):")
print(goalline %>% head(15))
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Targeted pass plays snapped inside the opponent 20.
in_red_zone = (
    (pbp["play_type"] == "pass")
    & (pbp["yardline_100"] <= 20)
    & (pbp["receiver_player_id"].notna())
)

rz_targets = pbp[in_red_zone].groupby(
    ["receiver_player_id", "receiver_player_name", "posteam"]
).agg(
    rz_targets=("play_id", "count"),
    rz_receptions=("complete_pass", "sum"),
    rz_tds=("pass_touchdown", "sum"),
    avg_depth=("air_yards", "mean"),
).reset_index()

# Efficiency rates inside the 20.
rz_targets["rz_catch_rate"] = rz_targets["rz_receptions"] / rz_targets["rz_targets"] * 100
rz_targets["rz_td_rate"] = rz_targets["rz_tds"] / rz_targets["rz_targets"] * 100
rz_targets = rz_targets.sort_values("rz_targets", ascending=False)

print("Top Red Zone Target Leaders:")
print(rz_targets.head(25))

# Same idea, restricted to inside the 10-yard line.
in_goal_line = (
    (pbp["play_type"] == "pass")
    & (pbp["yardline_100"] <= 10)
    & (pbp["receiver_player_id"].notna())
)
goalline = (
    pbp[in_goal_line]
    .groupby("receiver_player_name")
    .agg(gl_targets=("play_id", "count"), gl_tds=("pass_touchdown", "sum"))
    .reset_index()
    .sort_values("gl_targets", ascending=False)
)

print("\nGoal Line Target Leaders:")
print(goalline.head(15))
Packages: nflfastR tidyverse nfl_data_py pandas
DFS Ownership vs Value Analysis
Evaluate DFS value using weekly consistency, boom/bust rates, and floor/ceiling metrics.
Advanced
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Weekly fantasy points per player.
# Scoring: 0.1 pt/yard (rush + rec), 6 pts/TD, 1 pt/reception (full PPR).
# A play counts for a player when they are the receiver or the rusher;
# coalesce() picks whichever id/name is present on the play.
# NOTE(review): rec_tds sums the generic `touchdown` flag on plays where
# the player is the receiver, while other examples on this page use
# `pass_touchdown` — results can differ on unusual plays; confirm intent.
fantasy_ppg <- pbp %>%
  filter(!is.na(receiver_player_id) | !is.na(rusher_player_id)) %>%
  mutate(
    player_id = coalesce(receiver_player_id, rusher_player_id),
    player_name = coalesce(receiver_player_name, rusher_player_name)
  ) %>%
  group_by(player_id, player_name, week) %>%
  summarize(
    rush_yards = sum(rushing_yards, na.rm = TRUE),
    rush_tds = sum(rush_touchdown, na.rm = TRUE),
    receptions = sum(complete_pass, na.rm = TRUE),
    rec_yards = sum(receiving_yards, na.rm = TRUE),
    rec_tds = sum(touchdown[!is.na(receiver_player_id)], na.rm = TRUE),
    fantasy_pts = rush_yards * 0.1 + rush_tds * 6 + receptions * 1 +
                  rec_yards * 0.1 + rec_tds * 6,
    .groups = "drop"
  )

# Season-level consistency profile per player (min 8 games below).
# floor/ceiling = 10th/90th percentile weekly score;
# boom/bust = share of weeks above 15 / below 5 fantasy points.
consistency <- fantasy_ppg %>%
  group_by(player_id, player_name) %>%
  summarize(
    games = n(),
    avg_pts = mean(fantasy_pts),
    std_pts = sd(fantasy_pts),
    floor = quantile(fantasy_pts, 0.1),
    ceiling = quantile(fantasy_pts, 0.9),
    boom_rate = mean(fantasy_pts > 15) * 100,  # "Boom" games
    bust_rate = mean(fantasy_pts < 5) * 100,   # "Bust" games
    .groups = "drop"
  ) %>%
  filter(games >= 8) %>%
  mutate(
    consistency_score = avg_pts / (std_pts + 1),  # Higher is better; +1 guards divide-by-zero
    upside_score = ceiling / avg_pts  # Higher ceiling relative to average
  ) %>%
  arrange(desc(consistency_score))

print("Most Consistent Fantasy Performers:")
print(consistency %>% select(player_name, avg_pts, boom_rate, bust_rate, consistency_score) %>% head(20))
import nfl_data_py as nfl
import pandas as pd
import numpy as np  # NOTE(review): imported but not used below — confirm before removing

pbp = nfl.import_pbp_data([2023])

# Calculate weekly fantasy points.
# A play is attributed to a player when they are the receiver or the
# rusher; fillna() picks whichever id/name is present on the play.
skill_plays = pbp[(pbp["receiver_player_id"].notna()) | (pbp["rusher_player_id"].notna())].copy()
skill_plays["player_id"] = skill_plays["receiver_player_id"].fillna(skill_plays["rusher_player_id"])
skill_plays["player_name"] = skill_plays["receiver_player_name"].fillna(skill_plays["rusher_player_name"])

# Weekly component stats per player.
# NOTE(review): rec_tds sums `pass_touchdown` here, while the R version
# of this example sums the generic `touchdown` flag — confirm intent.
weekly = (skill_plays.groupby(["player_id", "player_name", "week"])
    .agg(
        rush_yards=("rushing_yards", "sum"),
        rush_tds=("rush_touchdown", "sum"),
        receptions=("complete_pass", "sum"),
        rec_yards=("receiving_yards", "sum"),
        rec_tds=("pass_touchdown", "sum")
    )
    .reset_index())

# PPR scoring: 0.1 pt/yard, 6 pts/TD, 1 pt/reception.
weekly["fantasy_pts"] = (weekly["rush_yards"] * 0.1 + weekly["rush_tds"] * 6 +
                          weekly["receptions"] * 1 + weekly["rec_yards"] * 0.1 +
                          weekly["rec_tds"] * 6)

# Consistency metrics over the season:
# floor/ceiling = 10th/90th percentile weekly score;
# boom/bust = share of weeks above 15 / below 5 fantasy points.
consistency = (weekly.groupby(["player_id", "player_name"])
    .agg(
        games=("fantasy_pts", "count"),
        avg_pts=("fantasy_pts", "mean"),
        std_pts=("fantasy_pts", "std"),
        floor=("fantasy_pts", lambda x: x.quantile(0.1)),
        ceiling=("fantasy_pts", lambda x: x.quantile(0.9)),
        boom_rate=("fantasy_pts", lambda x: (x > 15).mean() * 100),
        bust_rate=("fantasy_pts", lambda x: (x < 5).mean() * 100)
    )
    .reset_index())

# Require at least 8 games; +1 in the denominator guards divide-by-zero.
consistency = consistency[consistency["games"] >= 8]
consistency["consistency_score"] = consistency["avg_pts"] / (consistency["std_pts"] + 1)

# NOTE(review): floor/ceiling are computed but not shown in the output below.
consistency = consistency.sort_values("consistency_score", ascending=False)
print("Most Consistent Fantasy Performers:")
print(consistency[["player_name", "avg_pts", "boom_rate", "bust_rate", "consistency_score"]].head(20))
Packages: nflfastR tidyverse nfl_data_py pandas numpy
Rest of Season Rankings
Project remaining season value based on opportunities.
Advanced
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)
# Removed: rosters <- fast_scraper_roster(2023) — never referenced below,
# so it only triggered an unnecessary download.

# Opportunity metrics per player.
# A play counts for a player when they are the receiver or the rusher;
# a "touch" is a target or a carry. PPR scoring: 0.1/yd, 6/TD, 1/rec.
opportunities <- pbp %>%
  filter(!is.na(receiver_player_id) | !is.na(rusher_player_id)) %>%
  mutate(
    player_id = coalesce(receiver_player_id, rusher_player_id),
    player_name = coalesce(receiver_player_name, rusher_player_name),
    is_target = !is.na(receiver_player_id),
    is_carry = !is.na(rusher_player_id) & play_type == "run"
  ) %>%
  group_by(player_id, player_name, posteam) %>%
  summarize(
    games = n_distinct(game_id),
    targets = sum(is_target),
    carries = sum(is_carry),
    touches = targets + carries,
    fantasy_pts = sum(rushing_yards, na.rm = TRUE) * 0.1 +
                  sum(rush_touchdown, na.rm = TRUE) * 6 +
                  sum(complete_pass, na.rm = TRUE) * 1 +
                  sum(receiving_yards, na.rm = TRUE) * 0.1 +
                  sum(pass_touchdown[is_target], na.rm = TRUE) * 6,
    .groups = "drop"
  ) %>%
  mutate(
    touches_per_game = touches / games,
    pts_per_touch = fantasy_pts / touches,
    pts_per_game = fantasy_pts / games,
    # Project remaining games (assuming 17-game season)
    games_remaining = 17 - games,
    projected_ros_pts = pts_per_game * games_remaining
  ) %>%
  filter(games >= 5, touches >= 30) %>%
  arrange(desc(projected_ros_pts))

print("Rest of Season Fantasy Rankings:")
print(opportunities %>%
        select(player_name, games, pts_per_game, games_remaining, projected_ros_pts) %>%
        head(30))
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Plays where a player recorded a target or a carry.
plays = pbp[(pbp["receiver_player_id"].notna()) | (pbp["rusher_player_id"].notna())].copy()
plays["player_id"] = plays["receiver_player_id"].fillna(plays["rusher_player_id"])
plays["player_name"] = plays["receiver_player_name"].fillna(plays["rusher_player_name"])
plays["is_target"] = plays["receiver_player_id"].notna()
plays["is_carry"] = (plays["rusher_player_id"].notna()) & (plays["play_type"] == "run")

# Season totals per player.
opportunities = plays.groupby(["player_id", "player_name", "posteam"]).agg(
    games=("game_id", "nunique"),
    targets=("is_target", "sum"),
    carries=("is_carry", "sum"),
    rush_yards=("rushing_yards", "sum"),
    rush_tds=("rush_touchdown", "sum"),
    receptions=("complete_pass", "sum"),
    rec_yards=("receiving_yards", "sum"),
    rec_tds=("pass_touchdown", "sum"),
).reset_index()

# PPR scoring: 0.1/yard, 6/TD, 1/reception.
opportunities["touches"] = opportunities["targets"] + opportunities["carries"]
opportunities["fantasy_pts"] = (
    opportunities["rush_yards"] * 0.1
    + opportunities["rush_tds"] * 6
    + opportunities["receptions"] * 1
    + opportunities["rec_yards"] * 0.1
    + opportunities["rec_tds"] * 6
)
opportunities["pts_per_game"] = opportunities["fantasy_pts"] / opportunities["games"]

# Naive projection: assume the current per-game pace holds over the
# remainder of a 17-game season.
opportunities["games_remaining"] = 17 - opportunities["games"]
opportunities["projected_ros_pts"] = opportunities["pts_per_game"] * opportunities["games_remaining"]

# Volume filters, then rank by projected remaining points.
opportunities = opportunities[(opportunities["games"] >= 5) & (opportunities["touches"] >= 30)]
opportunities = opportunities.sort_values("projected_ros_pts", ascending=False)

print("Rest of Season Fantasy Rankings:")
print(opportunities[["player_name", "games", "pts_per_game", "games_remaining", "projected_ros_pts"]].head(30))
Packages: nflfastR tidyverse nfl_data_py pandas
Trade Value Calculator
Calculate relative trade value for fantasy players.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Plays where a player recorded a target or a carry.
skill_pbp <- pbp %>%
  filter(!is.na(receiver_player_id) | !is.na(rusher_player_id)) %>%
  mutate(
    player_id = coalesce(receiver_player_id, rusher_player_id),
    player_name = coalesce(receiver_player_name, rusher_player_name)
  )

# Season fantasy production per player (PPR: 0.1/yd, 6/TD, 1/rec),
# then a 1-100 trade value from the PPG percentile with tier labels.
player_values <- skill_pbp %>%
  group_by(player_id, player_name, posteam) %>%
  summarize(
    games = n_distinct(game_id),
    fantasy_pts = sum(rushing_yards, na.rm = TRUE) * 0.1 +
                  sum(rush_touchdown, na.rm = TRUE) * 6 +
                  sum(complete_pass, na.rm = TRUE) * 1 +
                  sum(receiving_yards, na.rm = TRUE) * 0.1 +
                  sum(pass_touchdown[!is.na(receiver_player_id)], na.rm = TRUE) * 6,
    .groups = "drop"
  ) %>%
  mutate(ppg = fantasy_pts / games) %>%
  filter(games >= 5) %>%
  mutate(
    value_percentile = percent_rank(ppg),         # PPG percentile drives value
    trade_value = round(value_percentile * 100),  # scale to 1-100
    tier = case_when(
      trade_value >= 90 ~ "Elite (Tier 1)",
      trade_value >= 75 ~ "Star (Tier 2)",
      trade_value >= 60 ~ "Starter (Tier 3)",
      trade_value >= 40 ~ "Flex (Tier 4)",
      TRUE ~ "Bench (Tier 5)"
    )
  ) %>%
  arrange(desc(trade_value))

print("Fantasy Trade Values:")
print(player_values %>%
        select(player_name, ppg, trade_value, tier) %>%
        head(40))
import nfl_data_py as nfl
import pandas as pd
import numpy as np

pbp = nfl.import_pbp_data([2023])

# Plays where a player recorded a target or a carry.
plays = pbp[(pbp["receiver_player_id"].notna()) | (pbp["rusher_player_id"].notna())].copy()
plays["player_id"] = plays["receiver_player_id"].fillna(plays["rusher_player_id"])
plays["player_name"] = plays["receiver_player_name"].fillna(plays["rusher_player_name"])

# Season production per player.
player_values = plays.groupby(["player_id", "player_name", "posteam"]).agg(
    games=("game_id", "nunique"),
    rush_yards=("rushing_yards", "sum"),
    rush_tds=("rush_touchdown", "sum"),
    receptions=("complete_pass", "sum"),
    rec_yards=("receiving_yards", "sum"),
    rec_tds=("pass_touchdown", "sum"),
).reset_index()

# PPR scoring and per-game rate; require a minimum of 5 games.
player_values["fantasy_pts"] = (
    player_values["rush_yards"] * 0.1
    + player_values["rush_tds"] * 6
    + player_values["receptions"] * 1
    + player_values["rec_yards"] * 0.1
    + player_values["rec_tds"] * 6
)
player_values["ppg"] = player_values["fantasy_pts"] / player_values["games"]
player_values = player_values[player_values["games"] >= 5]

# Trade value = PPG percentile, scaled to a 1-100 number.
player_values["value_percentile"] = player_values["ppg"].rank(pct=True)
player_values["trade_value"] = (player_values["value_percentile"] * 100).round()


def get_tier(val):
    """Map a 0-100 trade value onto a named tier."""
    for cutoff, label in ((90, "Elite (Tier 1)"), (75, "Star (Tier 2)"),
                          (60, "Starter (Tier 3)"), (40, "Flex (Tier 4)")):
        if val >= cutoff:
            return label
    return "Bench (Tier 5)"


player_values["tier"] = player_values["trade_value"].apply(get_tier)
player_values = player_values.sort_values("trade_value", ascending=False)

print("Fantasy Trade Values:")
print(player_values[["player_name", "ppg", "trade_value", "tier"]].head(40))
Packages: nflfastR tidyverse nfl_data_py pandas numpy
Matchup Analysis
Evaluate player value based on defensive matchups.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)
# Removed: schedules <- load_schedules(2023) — never referenced below,
# so it only triggered an unnecessary download.

# Defense vs the run: mean EPA allowed per rush (lower = better defense).
def_vs_rb <- pbp %>%
  filter(play_type == "run", !is.na(epa)) %>%
  group_by(defteam) %>%
  summarize(
    plays = n(),
    rush_epa_allowed = mean(epa),
    rush_ypc_allowed = mean(yards_gained),
    # na.rm = TRUE added so a stray NA cannot poison the total
    rush_td_allowed = sum(rush_touchdown, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    rb_rank = rank(rush_epa_allowed)  # Lower EPA = better defense
  )

# Defense vs the pass, same construction.
def_vs_wr <- pbp %>%
  filter(play_type == "pass", !is.na(epa)) %>%
  group_by(defteam) %>%
  summarize(
    plays = n(),
    pass_epa_allowed = mean(epa),
    ypa_allowed = mean(yards_gained),
    pass_td_allowed = sum(pass_touchdown, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    wr_rank = rank(pass_epa_allowed)  # Lower EPA = better defense
  )

# Combine rankings and label matchup difficulty:
# top 8 defenses = "Tough", 9-24 = "Average", bottom 8 = "Favorable".
def_rankings <- def_vs_rb %>%
  select(defteam, rush_epa_allowed, rb_rank) %>%
  left_join(def_vs_wr %>% select(defteam, pass_epa_allowed, wr_rank), by = "defteam") %>%
  mutate(
    rb_matchup = case_when(
      rb_rank <= 8 ~ "Tough",
      rb_rank <= 24 ~ "Average",
      TRUE ~ "Favorable"
    ),
    wr_matchup = case_when(
      wr_rank <= 8 ~ "Tough",
      wr_rank <= 24 ~ "Average",
      TRUE ~ "Favorable"
    )
  )

print("Defensive Matchup Rankings:")
print(def_rankings %>% arrange(rb_rank))
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])
# Removed: schedules = nfl.import_schedules([2023]) — never referenced
# below, so it only triggered an unnecessary download.

# Defense vs RB: mean EPA allowed per rush (lower = better defense).
rushes = pbp[(pbp["play_type"] == "run") & (pbp["epa"].notna())]
def_vs_rb = (rushes.groupby("defteam")
    .agg(
        plays=("epa", "count"),
        rush_epa_allowed=("epa", "mean"),
        rush_ypc=("yards_gained", "mean"),
        rush_td=("rush_touchdown", "sum")
    )
    .reset_index())
def_vs_rb["rb_rank"] = def_vs_rb["rush_epa_allowed"].rank()

# Defense vs WR: same construction for pass plays.
passes = pbp[(pbp["play_type"] == "pass") & (pbp["epa"].notna())]
def_vs_wr = (passes.groupby("defteam")
    .agg(
        plays=("epa", "count"),
        pass_epa_allowed=("epa", "mean"),
        ypa=("yards_gained", "mean"),
        pass_td=("pass_touchdown", "sum")
    )
    .reset_index())
def_vs_wr["wr_rank"] = def_vs_wr["pass_epa_allowed"].rank()

# Combine both rankings into one table.
def_rankings = def_vs_rb[["defteam", "rush_epa_allowed", "rb_rank"]].merge(
    def_vs_wr[["defteam", "pass_epa_allowed", "wr_rank"]], on="defteam")


def matchup_label(rank):
    """Label a defensive rank: top 8 = Tough, 9-24 = Average, else Favorable.

    Renamed from rb_matchup — the same labeling applies to both RB and WR
    ranks, so the old RB-specific name was misleading.
    """
    if rank <= 8:
        return "Tough"
    elif rank <= 24:
        return "Average"
    else:
        return "Favorable"


def_rankings["rb_matchup"] = def_rankings["rb_rank"].apply(matchup_label)
def_rankings["wr_matchup"] = def_rankings["wr_rank"].apply(matchup_label)

print("Defensive Matchup Rankings:")
print(def_rankings.sort_values("rb_rank"))
Packages: nflfastR tidyverse nfl_data_py pandas
Waiver Wire Gems
Find under-the-radar players with emerging opportunity.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Weekly touches and fantasy points per player.
# (PPR scoring: 0.1/yd, 6/TD, 1/reception.)
# Removed dead code: `is_touch = TRUE` was created but never used.
weekly_touches <- pbp %>%
  filter(!is.na(receiver_player_id) | !is.na(rusher_player_id)) %>%
  mutate(
    player_id = coalesce(receiver_player_id, rusher_player_id),
    player_name = coalesce(receiver_player_name, rusher_player_name)
  ) %>%
  group_by(player_id, player_name, week, posteam) %>%
  summarize(
    touches = n(),
    fantasy_pts = sum(rushing_yards, na.rm = TRUE) * 0.1 +
                  sum(rush_touchdown, na.rm = TRUE) * 6 +
                  sum(complete_pass, na.rm = TRUE) * 1 +
                  sum(receiving_yards, na.rm = TRUE) * 0.1 +
                  sum(pass_touchdown[!is.na(receiver_player_id)], na.rm = TRUE) * 6,
    .groups = "drop"
  )

# Compare the last 3 weeks against weeks 1-3 to surface rising usage.
max_week <- max(weekly_touches$week)
trending <- weekly_touches %>%
  mutate(
    period = case_when(
      week >= max_week - 2 ~ "Recent",
      week <= 3 ~ "Early",
      TRUE ~ "Middle"
    )
  ) %>%
  filter(period %in% c("Recent", "Early")) %>%
  group_by(player_id, player_name, posteam, period) %>%
  summarize(
    avg_touches = mean(touches),
    avg_pts = mean(fantasy_pts),
    .groups = "drop"
  ) %>%
  pivot_wider(names_from = period, values_from = c(avg_touches, avg_pts)) %>%
  filter(!is.na(avg_touches_Recent), !is.na(avg_touches_Early)) %>%
  mutate(
    touch_trend = avg_touches_Recent - avg_touches_Early,
    pts_trend = avg_pts_Recent - avg_pts_Early
  ) %>%
  filter(avg_pts_Recent < 15) %>%  # Filter out established stars
  arrange(desc(touch_trend))

print("Waiver Wire Gems (Increasing Opportunity):")
print(trending %>%
        select(player_name, avg_touches_Early, avg_touches_Recent, touch_trend, pts_trend) %>%
        head(15))
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Plays where a player was targeted or carried the ball.
plays = pbp[(pbp["receiver_player_id"].notna()) | (pbp["rusher_player_id"].notna())].copy()
plays["player_id"] = plays["receiver_player_id"].fillna(plays["rusher_player_id"])
plays["player_name"] = plays["receiver_player_name"].fillna(plays["rusher_player_name"])

# Weekly touch counts and component stats per player.
weekly = plays.groupby(["player_id", "player_name", "week", "posteam"]).agg(
    touches=("play_id", "count"),
    rush_yards=("rushing_yards", "sum"),
    rush_tds=("rush_touchdown", "sum"),
    receptions=("complete_pass", "sum"),
    rec_yards=("receiving_yards", "sum"),
    rec_tds=("pass_touchdown", "sum"),
).reset_index()

# PPR scoring: 0.1/yard, 6/TD, 1/reception.
weekly["fantasy_pts"] = (
    weekly["rush_yards"] * 0.1
    + weekly["rush_tds"] * 6
    + weekly["receptions"] * 1
    + weekly["rec_yards"] * 0.1
    + weekly["rec_tds"] * 6
)

# Average usage in weeks 1-3 vs the last three weeks of the sample.
max_week = weekly["week"].max()
early = weekly[weekly["week"] <= 3].groupby(["player_id", "player_name"]).agg(
    avg_touches_early=("touches", "mean"),
    avg_pts_early=("fantasy_pts", "mean"),
).reset_index()
recent = weekly[weekly["week"] >= max_week - 2].groupby(["player_id", "player_name"]).agg(
    avg_touches_recent=("touches", "mean"),
    avg_pts_recent=("fantasy_pts", "mean"),
).reset_index()

# Players present in both windows, ranked by touch growth.
trending = early.merge(recent, on=["player_id", "player_name"])
trending["touch_trend"] = trending["avg_touches_recent"] - trending["avg_touches_early"]
trending["pts_trend"] = trending["avg_pts_recent"] - trending["avg_pts_early"]

# Drop established stars — we want under-the-radar risers.
trending = trending[trending["avg_pts_recent"] < 15]
trending = trending.sort_values("touch_trend", ascending=False)

print("Waiver Wire Gems (Increasing Opportunity):")
print(trending[["player_name", "avg_touches_early", "avg_touches_recent", "touch_trend", "pts_trend"]].head(15))
Packages: nflfastR tidyverse nfl_data_py pandas

Machine Learning

Apply ML techniques for predictions, clustering, and pattern recognition

Win Probability Model (Logistic Regression)
Build a basic win probability model using game state features.
Advanced
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Prepare training data.
# NOTE(review): the target is nflfastR's own `wp` column, so this model
# learns to reproduce an existing win-probability estimate rather than
# raw game outcomes — fine as a teaching example; confirm intent.
# NOTE(review): away_team, defteam_timeouts_remaining and receive_2h_ko
# are selected here but not used in the model formula below.
model_data <- pbp %>%
  filter(!is.na(score_differential), !is.na(wp)) %>%
  select(
    home_team, away_team, posteam,
    game_seconds_remaining, score_differential,
    down, ydstogo, yardline_100,
    posteam_timeouts_remaining, defteam_timeouts_remaining,
    receive_2h_ko,
    wp  # Target variable (use as proxy for outcome)
  ) %>%
  mutate(
    is_home = posteam == home_team,
    # NOTE(review): qtr is not used in the formula below, and at exactly
    # 3600 seconds remaining it evaluates to 0 rather than 1.
    qtr = 4 - (game_seconds_remaining %/% 900)
  ) %>%
  drop_na()

# 80/20 train/test split (seeded for reproducibility)
set.seed(42)
train_idx <- sample(nrow(model_data), 0.8 * nrow(model_data))
train <- model_data[train_idx, ]
test <- model_data[-train_idx, ]

# Fit logistic regression on game-state features.
# quasibinomial accommodates a continuous 0-1 response (wp is a
# probability, not a 0/1 outcome).
model <- glm(
  wp ~ score_differential + game_seconds_remaining +
       is_home + down + ydstogo + yardline_100 +
       posteam_timeouts_remaining,
  data = train,
  family = "quasibinomial"
)

# Evaluate: agreement with nflfastR's wp at the 50% threshold, plus
# Brier score (mean squared error of the predicted probability).
test$pred_wp <- predict(model, test, type = "response")
test$correct <- (test$pred_wp > 0.5) == (test$wp > 0.5)

cat("Accuracy:", mean(test$correct) * 100, "%\n")
cat("Brier Score:", mean((test$pred_wp - test$wp)^2), "\n")
import nfl_data_py as nfl
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, brier_score_loss

pbp = nfl.import_pbp_data([2023])

# Prepare data.
# NOTE(review): the label below is derived from nflfastR's own `wp`
# column (wp > 0.5), so the model learns to reproduce an existing
# win-probability estimate rather than actual game outcomes — fine as a
# teaching example; confirm intent.
model_data = pbp[
    pbp["score_differential"].notna() &
    pbp["wp"].notna()
][["home_team", "posteam", "game_seconds_remaining",
   "score_differential", "down", "ydstogo", "yardline_100",
   "posteam_timeouts_remaining", "wp"]].dropna()

# 1 when the offense is the home team.
model_data["is_home"] = (model_data["posteam"] == model_data["home_team"]).astype(int)

# Game-state features and binarized target.
features = ["score_differential", "game_seconds_remaining", "is_home",
            "down", "ydstogo", "yardline_100", "posteam_timeouts_remaining"]
X = model_data[features]
y = (model_data["wp"] > 0.5).astype(int)

# 80/20 train/test split (seeded for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Logistic regression; max_iter raised so the solver converges.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate: hard-label accuracy plus Brier score on the probabilities.
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.1f}%")
print(f"Brier Score: {brier_score_loss(y_test, y_prob):.4f}")
Packages: nflfastR tidyverse nfl_data_py pandas scikit-learn
Player Clustering (K-Means)
Cluster receivers by their play style using statistical profiles.
Advanced
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Build per-receiver statistical profiles (min 40 targets below).
receiver_profiles <- pbp %>%
  filter(!is.na(receiver_player_id), play_type == "pass") %>%
  group_by(receiver_player_id, receiver_player_name) %>%
  summarize(
    targets = n(),
    catch_rate = mean(complete_pass),
    avg_depth = mean(air_yards, na.rm = TRUE),
    # YAC averaged over completed catches only
    yac_per_rec = mean(yards_after_catch[complete_pass == 1], na.rm = TRUE),
    # NOTE(review): defenders_in_box is a box-count stat, so calling
    # this "contested_rate" is a loose proxy at best; it is also
    # excluded from the clustering features below — confirm whether it
    # should be kept at all.
    contested_rate = mean(defenders_in_box >= 7, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(targets >= 40) %>%
  drop_na()

# Standardize features so no single stat dominates the distance metric
features <- receiver_profiles %>%
  select(catch_rate, avg_depth, yac_per_rec)
scaled_features <- scale(features)

# K-means clustering (4 clusters; nstart restarts guard against bad seeds)
set.seed(42)
kmeans_result <- kmeans(scaled_features, centers = 4, nstart = 25)

# Add cluster labels back onto the profiles
receiver_profiles$cluster <- factor(kmeans_result$cluster)

# Per-cluster averages reveal each cluster's play style
cluster_summary <- receiver_profiles %>%
  group_by(cluster) %>%
  summarize(
    n_players = n(),
    avg_catch_rate = mean(catch_rate),
    avg_depth = mean(avg_depth),
    avg_yac = mean(yac_per_rec)
  )

print(cluster_summary)

# Name clusters based on characteristics
# Cluster 1: Deep threats (high depth)
# Cluster 2: Possession receivers (high catch rate)
# etc.
import nfl_data_py as nfl
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

pbp = nfl.import_pbp_data([2023])

# Targeted pass plays. .copy() so the derived column below does not
# trigger pandas' SettingWithCopyWarning on a filtered view.
pass_plays = pbp[(pbp["receiver_player_id"].notna()) &
                 (pbp["play_type"] == "pass")].copy()

# YAC only counts on completed catches: mask incompletions to NaN so the
# groupby mean (which skips NaN) averages over receptions only. This
# replaces the previous index-based lambda, which reached back into the
# outer frame via x.index and silently breaks if the index is not
# preserved through the groupby.
pass_plays["yac_on_catch"] = pass_plays["yards_after_catch"].where(
    pass_plays["complete_pass"] == 1)

receiver_profiles = (pass_plays.groupby(
    ["receiver_player_id", "receiver_player_name"])
    .agg(
        targets=("play_id", "count"),
        catch_rate=("complete_pass", "mean"),
        avg_depth=("air_yards", "mean"),
        yac_per_rec=("yac_on_catch", "mean")
    )
    .reset_index()
    .dropna())

# Minimum target volume for a stable profile.
receiver_profiles = receiver_profiles[receiver_profiles["targets"] >= 40]

# Standardize features so no single stat dominates the distance metric.
features = receiver_profiles[["catch_rate", "avg_depth", "yac_per_rec"]]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# K-means clustering (4 clusters; n_init restarts guard against bad seeds).
kmeans = KMeans(n_clusters=4, random_state=42, n_init=25)
receiver_profiles["cluster"] = kmeans.fit_predict(scaled_features)

# Per-cluster averages reveal each cluster's play style.
cluster_summary = (receiver_profiles.groupby("cluster")
    .agg(
        n_players=("receiver_player_name", "count"),
        avg_catch_rate=("catch_rate", "mean"),
        avg_depth=("avg_depth", "mean"),
        avg_yac=("yac_per_rec", "mean")
    )
    .reset_index())

print(cluster_summary)
Packages: nflfastR tidyverse nfl_data_py pandas scikit-learn
Random Forest Play Prediction
Predict play type (pass vs run) using game state features.
Advanced
library(nflfastR)
library(tidyverse)
library(randomForest)

pbp <- load_pbp(2023)

# Prepare data for play prediction.
# Target: is_pass (factor) — was the play a pass or a run?
# Predictors: down/distance, field position, score, clock, home/away.
model_data <- pbp %>%
  filter(play_type %in% c("pass", "run"), !is.na(down)) %>%
  mutate(
    is_pass = as.factor(play_type == "pass"),
    is_home = posteam == home_team
  ) %>%
  select(is_pass, down, ydstogo, yardline_100, score_differential,
         game_seconds_remaining, is_home) %>%
  drop_na()

# 80/20 train/test split (seeded for reproducibility)
set.seed(42)
train_idx <- sample(nrow(model_data), 0.8 * nrow(model_data))
train <- model_data[train_idx, ]
test <- model_data[-train_idx, ]

# Train random forest (100 trees; importance = TRUE keeps Gini stats)
rf_model <- randomForest(
  is_pass ~ .,
  data = train,
  ntree = 100,
  importance = TRUE
)

# Held-out accuracy
test$pred <- predict(rf_model, test)
accuracy <- mean(test$pred == test$is_pass)
cat("Accuracy:", accuracy * 100, "%\n")

# Feature importance: which situational features drive the pass/run call
importance(rf_model) %>% as.data.frame() %>%
  rownames_to_column("feature") %>%
  arrange(desc(MeanDecreaseGini)) %>%
  print()
import nfl_data_py as nfl
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

pbp = nfl.import_pbp_data([2023])

# Keep only pass/run plays that have a down recorded.
plays = pbp[(pbp["play_type"].isin(["pass", "run"])) & (pbp["down"].notna())].copy()

# Binary target plus a home-field indicator.
plays["is_pass"] = (plays["play_type"] == "pass").astype(int)
plays["is_home"] = (plays["posteam"] == plays["home_team"]).astype(int)

# Situational features for the classifier.
features = ["down", "ydstogo", "yardline_100", "score_differential",
            "game_seconds_remaining", "is_home"]
plays = plays[features + ["is_pass"]].dropna()

# 80/20 split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    plays[features], plays["is_pass"], test_size=0.2, random_state=42
)

# 100-tree random forest.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Held-out accuracy, then which features drive the pass/run call.
y_pred = rf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.1f}%")
print("\nFeature Importance:")
importance = pd.DataFrame({
    "feature": features,
    "importance": rf.feature_importances_
}).sort_values("importance", ascending=False)
print(importance)
Packages: nflfastR tidyverse randomForest nfl_data_py pandas scikit-learn
XGBoost Win Prediction
Build an XGBoost model to predict game outcomes.
Advanced
library(nflfastR)
library(tidyverse)
library(xgboost)

schedules <- load_schedules(2020:2023)
pbp <- load_pbp(2020:2023)

# Calculate team metrics per season
# Offensive EPA/play and pass rate per team-season, then join each team's
# defensive EPA/play (mean EPA allowed while on defense).
team_metrics <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
  group_by(season, posteam) %>%
  summarize(
    off_epa = mean(epa),
    pass_rate = mean(play_type == "pass"),
    .groups = "drop"
  ) %>%
  left_join(
    pbp %>%
      filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
      group_by(season, defteam) %>%
      summarize(def_epa = mean(epa), .groups = "drop"),
    by = c("season", "posteam" = "defteam")
  )

# Create game features
# Attach each side's season metrics to every completed game; result is the
# home margin, so result > 0 encodes a home win.
games <- schedules %>%
  filter(!is.na(result)) %>%
  left_join(team_metrics, by = c("season", "home_team" = "posteam")) %>%
  rename(home_off = off_epa, home_def = def_epa, home_pass = pass_rate) %>%
  left_join(team_metrics, by = c("season", "away_team" = "posteam")) %>%
  rename(away_off = off_epa, away_def = def_epa, away_pass = pass_rate) %>%
  mutate(home_win = as.numeric(result > 0)) %>%
  select(home_off, home_def, home_pass, away_off, away_def, away_pass, home_win) %>%
  drop_na()

# Prepare for XGBoost
# xgboost wants a numeric matrix plus a separate 0/1 label vector.
train_matrix <- as.matrix(games %>% select(-home_win))
labels <- games$home_win

# Train XGBoost
# NOTE(review): trained on all games with no holdout — this demo reports
# feature importance only, not an out-of-sample evaluation.
xgb_model <- xgboost(
  data = train_matrix,
  label = labels,
  nrounds = 100,
  objective = "binary:logistic",
  verbose = 0
)

# Feature importance
importance <- xgb.importance(model = xgb_model)
print(importance)
# Predict home-team wins with XGBoost using season-level EPA features.
import nfl_data_py as nfl
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

pbp = nfl.import_pbp_data([2020, 2021, 2022, 2023])
schedules = nfl.import_schedules([2020, 2021, 2022, 2023])

# Offensive/defensive EPA per team-season (pass/run plays only).
plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"].isin(["pass", "run"]))]

team_off = (plays.groupby(["season", "posteam"])
    .agg(off_epa=("epa", "mean"), pass_rate=("play_type", lambda x: (x == "pass").mean()))
    .reset_index())

team_def = (plays.groupby(["season", "defteam"])
    .agg(def_epa=("epa", "mean"))
    .reset_index())

team_metrics = team_off.merge(team_def, left_on=["season", "posteam"],
                               right_on=["season", "defteam"])

# Attach home and away season metrics to each completed game.
games = schedules[schedules["result"].notna()].copy()
games = games.merge(team_metrics[["season", "posteam", "off_epa", "def_epa", "pass_rate"]],
                    left_on=["season", "home_team"], right_on=["season", "posteam"])
games = games.rename(columns={"off_epa": "home_off", "def_epa": "home_def", "pass_rate": "home_pass"})
games = games.merge(team_metrics[["season", "posteam", "off_epa", "def_epa", "pass_rate"]],
                    left_on=["season", "away_team"], right_on=["season", "posteam"])
games = games.rename(columns={"off_epa": "away_off", "def_epa": "away_def", "pass_rate": "away_pass"})

# result is the home margin, so > 0 means a home win.
games["home_win"] = (games["result"] > 0).astype(int)
features = ["home_off", "home_def", "home_pass", "away_off", "away_def", "away_pass"]
games = games[features + ["home_win"]].dropna()

# Train/test split and model fit.
X_train, X_test, y_train, y_test = train_test_split(
    games[features], games["home_win"], test_size=0.2, random_state=42)

# NOTE: use_label_encoder was deprecated in XGBoost 1.3 and removed in 2.0,
# so it is no longer passed to XGBClassifier.
model = xgb.XGBClassifier(n_estimators=100, eval_metric="logloss")
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.1f}%")
print("\nFeature Importance:")
print(pd.DataFrame({"feature": features, "importance": model.feature_importances_}).sort_values("importance", ascending=False))
Packages: nflfastR tidyverse xgboost nfl_data_py pandas
Feature Engineering for NFL
Create advanced features for NFL prediction models.
Advanced
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Create advanced features
# yardline_100 is the distance to the opponent's end zone, so small values
# mean scoring position.
features <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
  mutate(
    # Situational features
    is_neutral_game_script = abs(score_differential) <= 7,
    # Blowout in the 4th, or a two-score-plus lead inside the final 2 minutes
    is_garbage_time = (qtr == 4 & abs(score_differential) > 21) |
                      (qtr >= 4 & game_seconds_remaining < 120 & abs(score_differential) > 14),
    is_two_minute = game_seconds_remaining <= 120 & qtr %in% c(2, 4),
    is_redzone = yardline_100 <= 20,
    is_goal_to_go = goal_to_go == 1,
    is_third_or_fourth = down >= 3,

    # Distance buckets
    distance_bucket = case_when(
      ydstogo <= 3 ~ "short",
      ydstogo <= 7 ~ "medium",
      ydstogo <= 10 ~ "long",
      TRUE ~ "very_long"
    ),

    # Field position buckets
    field_bucket = case_when(
      yardline_100 >= 80 ~ "own_deep",
      yardline_100 >= 50 ~ "own_side",
      yardline_100 >= 20 ~ "opponent",
      TRUE ~ "redzone"
    ),

    # Time features
    # 3600 = seconds in regulation; overtime handling not addressed here
    game_pct_complete = 1 - (game_seconds_remaining / 3600),
    half = if_else(qtr <= 2, 1, 2),

    # Rolling metrics (would need window function)
    pass_heavy = NA  # Placeholder for rolling pass rate
  )

# Show feature distribution
# Pass rate and EPA for every distance/field-position bucket combination
features %>%
  group_by(distance_bucket, field_bucket) %>%
  summarize(
    plays = n(),
    pass_rate = mean(play_type == "pass"),
    avg_epa = mean(epa),
    .groups = "drop"
  ) %>%
  print()
# Engineer situational, distance, field-position, and time features.
import nfl_data_py as nfl
import pandas as pd
import numpy as np

pbp = nfl.import_pbp_data([2023])

# Restrict to pass/run plays with EPA available.
keep = pbp["epa"].notna() & pbp["play_type"].isin(["pass", "run"])
plays = pbp[keep].copy()

# Situational flags
score_gap = plays["score_differential"].abs()
plays["is_neutral"] = score_gap <= 7
plays["is_garbage_time"] = (
    ((plays["qtr"] == 4) & (score_gap > 21)) |
    ((plays["qtr"] >= 4) & (plays["game_seconds_remaining"] < 120) & (score_gap > 14))
)
plays["is_two_minute"] = (plays["game_seconds_remaining"] <= 120) & (plays["qtr"].isin([2, 4]))
plays["is_redzone"] = plays["yardline_100"] <= 20
plays["is_third_or_fourth"] = plays["down"] >= 3

# Distance-to-go buckets (conditions checked in order, like the if/elif chain)
dist = plays["ydstogo"]
plays["distance_bucket"] = np.select(
    [dist <= 3, dist <= 7, dist <= 10],
    ["short", "medium", "long"],
    default="very_long",
)

# Field-position buckets (yardline_100 = distance to the opponent end zone)
yl = plays["yardline_100"]
plays["field_bucket"] = np.select(
    [yl >= 80, yl >= 50, yl >= 20],
    ["own_deep", "own_side", "opponent"],
    default="redzone",
)

# Time features
plays["game_pct_complete"] = 1 - plays["game_seconds_remaining"] / 3600
plays["half"] = np.where(plays["qtr"] <= 2, 1, 2)

# Summarize pass rate and EPA by bucket combination.
feature_dist = (plays.groupby(["distance_bucket", "field_bucket"])
    .agg(
        plays=("epa", "count"),
        pass_rate=("play_type", lambda s: (s == "pass").mean()),
        avg_epa=("epa", "mean")
    )
    .reset_index())

print(feature_dist)
Packages: nflfastR tidyverse nfl_data_py pandas numpy
Cross-Validation Strategies
Implement proper cross-validation for NFL prediction models.
Advanced
library(nflfastR)
library(tidyverse)
library(caret)

pbp <- load_pbp(2020:2023)

# Prepare data
# Binary target (pass vs run) plus game-state predictors; season and week
# are kept only for splitting, not as model inputs.
model_data <- pbp %>%
  filter(play_type %in% c("pass", "run"), !is.na(down), !is.na(epa)) %>%
  mutate(is_pass = factor(play_type == "pass", levels = c(FALSE, TRUE))) %>%
  select(season, week, is_pass, down, ydstogo, yardline_100,
         score_differential, game_seconds_remaining) %>%
  drop_na()

# Time-series cross-validation (train on past, test on future)
# Expanding window: fit on all seasons before test_year, then score on it.
results <- list()
for (test_year in 2021:2023) {
  train <- model_data %>% filter(season < test_year)
  test <- model_data %>% filter(season == test_year)

  model <- glm(is_pass ~ down + ydstogo + yardline_100 +
                         score_differential + game_seconds_remaining,
               data = train, family = "binomial")

  # Predicted pass probability; threshold at 0.5 for a class label
  preds <- predict(model, test, type = "response")
  pred_class <- ifelse(preds > 0.5, TRUE, FALSE)
  accuracy <- mean(pred_class == test$is_pass)

  results[[as.character(test_year)]] <- accuracy
  cat("Test Year:", test_year, "- Accuracy:", round(accuracy * 100, 1), "%\n")
}

# Average performance
cat("\nAverage Accuracy:", round(mean(unlist(results)) * 100, 1), "%\n")

# Week-by-week validation (within season)
# This accounts for within-season learning
# For each 2023 week, train only on earlier weeks of that season; early
# weeks with fewer than 100 training plays yield NA.
week_results <- model_data %>%
  filter(season == 2023) %>%
  group_by(week) %>%
  group_split() %>%
  map_dbl(function(test_week) {
    w <- unique(test_week$week)
    train <- model_data %>% filter(season == 2023, week < w)
    if (nrow(train) < 100) return(NA)

    model <- glm(is_pass ~ ., data = train %>% select(-season, -week),
                 family = "binomial")
    preds <- predict(model, test_week, type = "response") > 0.5
    mean(preds == test_week$is_pass)
  })

cat("\nWeek-by-Week Validation (2023):")
print(week_results, na.print = "N/A")
# Time-aware cross-validation: train on past seasons, test on future ones.
import nfl_data_py as nfl
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import TimeSeriesSplit

pbp = nfl.import_pbp_data([2020, 2021, 2022, 2023])

# Prepare data
# Pass/run plays with known down and EPA; season/week kept for splitting.
model_data = pbp[
    (pbp["play_type"].isin(["pass", "run"])) &
    (pbp["down"].notna()) &
    (pbp["epa"].notna())
].copy()

model_data["is_pass"] = (model_data["play_type"] == "pass").astype(int)
features = ["down", "ydstogo", "yardline_100", "score_differential", "game_seconds_remaining"]
model_data = model_data[["season", "week"] + features + ["is_pass"]].dropna()

# Time-series cross-validation (train on past, test on future)
# Expanding window: each test season is predicted from all prior seasons.
results = []
for test_year in [2021, 2022, 2023]:
    train = model_data[model_data["season"] < test_year]
    test = model_data[model_data["season"] == test_year]

    X_train, y_train = train[features], train["is_pass"]
    X_test, y_test = test[features], test["is_pass"]

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    results.append(accuracy)
    print(f"Test Year: {test_year} - Accuracy: {accuracy * 100:.1f}%")

print(f"\nAverage Accuracy: {sum(results) / len(results) * 100:.1f}%")

# Alternative: sklearn TimeSeriesSplit
# NOTE(review): TimeSeriesSplit assumes rows are in chronological order;
# seasons are loaded in order above, but verify ordering before relying on it.
print("\nTimeSeriesSplit (3-fold):")
tscv = TimeSeriesSplit(n_splits=3)
X = model_data[features]
y = model_data["is_pass"]

for i, (train_idx, test_idx) in enumerate(tscv.split(X)):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    acc = accuracy_score(y_test, model.predict(X_test))
    print(f"Fold {i+1}: {acc * 100:.1f}%")
Packages: nflfastR tidyverse caret nfl_data_py pandas scikit-learn
Model Evaluation Metrics
Evaluate prediction models with proper metrics for NFL data.
Advanced
# Evaluate a simple win-probability model with Brier score, log loss,
# AUC-ROC, and a calibration table.
library(nflfastR)
library(tidyverse)
library(pROC)

pbp <- load_pbp(2023)
schedules <- load_schedules(2023)

# Build a simple win probability prediction
model_data <- pbp %>%
  filter(!is.na(wp), !is.na(score_differential)) %>%
  select(game_id, play_id, wp, score_differential,
         game_seconds_remaining, posteam) %>%
  drop_na()

# Create actual outcome
# result is the home margin, so result > 0 means the home team won.
game_outcomes <- schedules %>%
  filter(!is.na(result)) %>%
  mutate(home_win = result > 0) %>%
  select(game_id, home_team, home_win)

# A play's possession team "won" if it was on the winning side of the game.
model_data <- model_data %>%
  left_join(game_outcomes, by = "game_id") %>%
  mutate(actual_win = (posteam == home_team & home_win) |
                      (posteam != home_team & !home_win))

# Simple model based on score differential and time
# NOTE(review): evaluated in-sample (no holdout); metrics are optimistic.
model <- glm(actual_win ~ score_differential + game_seconds_remaining,
             data = model_data, family = "binomial")

model_data$pred_wp <- predict(model, type = "response")

# Evaluation metrics
# 1. Brier Score (lower is better)
brier <- mean((model_data$pred_wp - model_data$actual_win)^2)
cat("Brier Score:", round(brier, 4), "\n")

# 2. Log Loss (1e-10 guards against log(0))
log_loss <- -mean(model_data$actual_win * log(model_data$pred_wp + 1e-10) +
                  (1 - model_data$actual_win) * log(1 - model_data$pred_wp + 1e-10))
cat("Log Loss:", round(log_loss, 4), "\n")

# 3. AUC-ROC
roc_obj <- roc(model_data$actual_win, model_data$pred_wp, quiet = TRUE)
cat("AUC-ROC:", round(auc(roc_obj), 4), "\n")

# 4. Calibration by probability bucket
calibration <- model_data %>%
  mutate(wp_bucket = cut(pred_wp, breaks = seq(0, 1, 0.1))) %>%
  group_by(wp_bucket) %>%
  summarize(
    n = n(),
    predicted = mean(pred_wp),
    actual = mean(actual_win)
  )

# cat() (not print()) so the leading "\n" renders as a newline instead of
# being printed literally.
cat("\nCalibration by Probability Bucket:\n")
print(calibration)
# Evaluate a simple in-game win-probability model: Brier score, log loss,
# AUC-ROC, accuracy, and a calibration table.
import nfl_data_py as nfl
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (brier_score_loss, log_loss, roc_auc_score,
                             accuracy_score, confusion_matrix)

pbp = nfl.import_pbp_data([2023])
schedules = nfl.import_schedules([2023])

# Prepare data
model_data = pbp[
    (pbp["wp"].notna()) &
    (pbp["score_differential"].notna())
][["game_id", "wp", "score_differential", "game_seconds_remaining", "posteam"]].dropna()

# Get actual outcomes
# result is the home margin, so result > 0 means the home team won.
outcomes = schedules[schedules["result"].notna()][["game_id", "home_team", "result"]].copy()
outcomes["home_win"] = outcomes["result"] > 0

model_data = model_data.merge(outcomes, on="game_id")
# A play's possession team "won" if it was on the winning side of the game.
model_data["actual_win"] = (
    ((model_data["posteam"] == model_data["home_team"]) & model_data["home_win"]) |
    ((model_data["posteam"] != model_data["home_team"]) & ~model_data["home_win"])
).astype(int)

# Simple model
# NOTE(review): fit and evaluated on the same data (in-sample); metrics
# are optimistic compared to a proper holdout.
X = model_data[["score_differential", "game_seconds_remaining"]]
y = model_data["actual_win"]

model = LogisticRegression(max_iter=1000)
model.fit(X, y)
pred_wp = model.predict_proba(X)[:, 1]

# Evaluation metrics
print("Model Evaluation Metrics:")
print(f"Brier Score: {brier_score_loss(y, pred_wp):.4f}")
print(f"Log Loss: {log_loss(y, pred_wp):.4f}")
print(f"AUC-ROC: {roc_auc_score(y, pred_wp):.4f}")
print(f"Accuracy: {accuracy_score(y, (pred_wp > 0.5).astype(int)) * 100:.1f}%")

# Calibration
# Bucket predictions into deciles and compare mean prediction to outcome rate.
model_data["pred_wp"] = pred_wp
model_data["wp_bucket"] = pd.cut(pred_wp, bins=np.arange(0, 1.1, 0.1))

calibration = (model_data.groupby("wp_bucket")
    .agg(n=("actual_win", "count"),
         predicted=("pred_wp", "mean"),
         actual=("actual_win", "mean"))
    .reset_index())

print("\nCalibration by Probability Bucket:")
print(calibration)
Packages: nflfastR tidyverse pROC nfl_data_py pandas scikit-learn
Hyperparameter Tuning
Optimize model hyperparameters using grid search and cross-validation.
Advanced
# Tune a random forest's mtry with caret grid search + 5-fold CV.
library(nflfastR)
library(tidyverse)
library(caret)

pbp <- load_pbp(2023)

# Prepare data
# caret's classProbs = TRUE requires factor levels that are valid R variable
# names; "TRUE"/"FALSE" are reserved words and make train() error, so the
# target is labeled "run"/"pass" instead.
model_data <- pbp %>%
  filter(play_type %in% c("pass", "run"), !is.na(down)) %>%
  mutate(is_pass = factor(if_else(play_type == "pass", "pass", "run"),
                          levels = c("run", "pass"))) %>%
  select(is_pass, down, ydstogo, yardline_100,
         score_differential, game_seconds_remaining) %>%
  drop_na() %>%
  sample_n(min(10000, n()))  # Subsample for speed

# Define hyperparameter grid for Random Forest
# mtry = number of predictors sampled at each split
rf_grid <- expand.grid(
  mtry = c(2, 3, 4, 5)
)

# Cross-validation control
# twoClassSummary reports ROC/Sens/Spec and needs class probabilities.
train_control <- trainControl(
  method = "cv",
  number = 5,
  verboseIter = TRUE,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

# Train with grid search
set.seed(42)
rf_tuned <- train(
  is_pass ~ .,
  data = model_data,
  method = "rf",
  trControl = train_control,
  tuneGrid = rf_grid,
  metric = "ROC",
  ntree = 50
)

# Results
print(rf_tuned)
print(rf_tuned$bestTune)

# Plot tuning results
plot(rf_tuned)

# Best model performance
confusionMatrix(rf_tuned)
# Tune random-forest hyperparameters with an exhaustive grid search.
# (Unused imports make_scorer / roc_auc_score / cross_val_score removed.)
import nfl_data_py as nfl
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

pbp = nfl.import_pbp_data([2023])

# Prepare data: pass/run plays with a known down.
model_data = pbp[
    (pbp["play_type"].isin(["pass", "run"])) &
    (pbp["down"].notna())
].copy()

model_data["is_pass"] = (model_data["play_type"] == "pass").astype(int)
features = ["down", "ydstogo", "yardline_100", "score_differential", "game_seconds_remaining"]
model_data = model_data[features + ["is_pass"]].dropna()

# Subsample for speed; seeded for reproducibility.
model_data = model_data.sample(n=min(10000, len(model_data)), random_state=42)
X = model_data[features]
y = model_data["is_pass"]

# Full hyperparameter grid (shown for reference; not searched here)
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [5, 10, 15, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

# Smaller grid for faster demo
small_grid = {
    "n_estimators": [50, 100],
    "max_depth": [5, 10],
    "min_samples_split": [2, 5]
}

# Grid search with 5-fold CV, scored by ROC AUC, parallelized across cores.
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    rf,
    small_grid,
    cv=5,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X, y)

# Results
print(f"\nBest Parameters: {grid_search.best_params_}")
print(f"Best CV Score (AUC): {grid_search.best_score_:.4f}")

# All results: rank_test_score == 1 is the best combination.
results = pd.DataFrame(grid_search.cv_results_)
print("\nTop 5 Parameter Combinations:")
print(results.nsmallest(5, "rank_test_score")[["params", "mean_test_score", "std_test_score"]])
Packages: nflfastR tidyverse caret nfl_data_py pandas scikit-learn
Ensemble Methods
Combine multiple models for better prediction accuracy.
Advanced
# Compare three classifiers and a simple averaged ensemble for pass/run
# prediction.
library(nflfastR)
library(tidyverse)
library(caret)
library(randomForest)

pbp <- load_pbp(2023)

# Prepare data
# Target labeled "run"/"pass"; with these levels, glm models P(pass).
model_data <- pbp %>%
  filter(play_type %in% c("pass", "run"), !is.na(down)) %>%
  mutate(is_pass = factor(play_type == "pass", levels = c(FALSE, TRUE), labels = c("run", "pass"))) %>%
  select(is_pass, down, ydstogo, yardline_100,
         score_differential, game_seconds_remaining) %>%
  drop_na() %>%
  sample_n(min(5000, n()))

# Split data
set.seed(42)
train_idx <- sample(nrow(model_data), 0.8 * nrow(model_data))
train <- model_data[train_idx, ]
test <- model_data[-train_idx, ]

# Train individual models
# 1. Logistic Regression ("pass" is the second factor level = success)
glm_model <- glm(is_pass ~ ., data = train, family = "binomial")
glm_pred <- predict(glm_model, test, type = "response")

# 2. Random Forest (probability of the "pass" class)
rf_model <- randomForest(is_pass ~ ., data = train, ntree = 100)
rf_pred <- predict(rf_model, test, type = "prob")[, "pass"]

# 3. Simple rule-based (pass rate by down)
# Each test play gets its down's base pass rate from the training data.
rule_pred <- test %>%
  left_join(
    train %>% group_by(down) %>% summarize(base_rate = mean(is_pass == "pass")),
    by = "down"
  ) %>%
  pull(base_rate)

# Ensemble predictions (average)
# Unweighted mean of the three probability estimates
ensemble_pred <- (glm_pred + rf_pred + rule_pred) / 3
ensemble_class <- ifelse(ensemble_pred > 0.5, "pass", "run")

# Evaluate each model
actual <- test$is_pass

results <- tibble(
  Model = c("Logistic", "Random Forest", "Rule-based", "Ensemble"),
  Accuracy = c(
    mean((glm_pred > 0.5) == (actual == "pass")),
    mean((rf_pred > 0.5) == (actual == "pass")),
    mean((rule_pred > 0.5) == (actual == "pass")),
    mean(ensemble_class == actual)
  )
)

print(results)
# Compare individual classifiers with voting and stacking ensembles.
import nfl_data_py as nfl
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

pbp = nfl.import_pbp_data([2023])

# Prepare data
model_data = pbp[
    (pbp["play_type"].isin(["pass", "run"])) &
    (pbp["down"].notna())
].copy()

model_data["is_pass"] = (model_data["play_type"] == "pass").astype(int)
features = ["down", "ydstogo", "yardline_100", "score_differential", "game_seconds_remaining"]
model_data = model_data[features + ["is_pass"]].dropna()
# Subsample for speed; seeded so the run is reproducible
model_data = model_data.sample(n=min(5000, len(model_data)), random_state=42)

X = model_data[features]
y = model_data["is_pass"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Individual models
lr = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)

# Train and evaluate individually
results = []
for name, model in [("Logistic", lr), ("Random Forest", rf), ("KNN", knn)]:
    model.fit(X_train, y_train)
    acc = accuracy_score(y_test, model.predict(X_test))
    results.append({"Model": name, "Accuracy": acc})

# Voting Ensemble
# soft voting averages predicted class probabilities across the models
voting = VotingClassifier(
    estimators=[("lr", lr), ("rf", rf), ("knn", knn)],
    voting="soft"
)
voting.fit(X_train, y_train)
results.append({"Model": "Voting Ensemble", "Accuracy": accuracy_score(y_test, voting.predict(X_test))})

# Stacking Ensemble
# base models' out-of-fold predictions feed a logistic-regression meta-learner
stacking = StackingClassifier(
    estimators=[("lr", LogisticRegression(max_iter=1000)),
                ("rf", RandomForestClassifier(n_estimators=50))],
    final_estimator=LogisticRegression()
)
stacking.fit(X_train, y_train)
results.append({"Model": "Stacking Ensemble", "Accuracy": accuracy_score(y_test, stacking.predict(X_test))})

print("Model Comparison:")
print(pd.DataFrame(results).sort_values("Accuracy", ascending=False))
Packages: nflfastR tidyverse caret randomForest nfl_data_py pandas scikit-learn
Neural Network for Spread Prediction
Build a simple neural network for predicting game spreads.
Advanced
# Predict the home team's margin relative to the spread with a small
# single-hidden-layer neural network (nnet).
library(nflfastR)
library(tidyverse)
library(nnet)

schedules <- load_schedules(2020:2023)
pbp <- load_pbp(2020:2023)

# Calculate team features
# Offensive EPA/play and pass rate per team-season, plus defensive EPA/play.
team_features <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
  group_by(season, posteam) %>%
  summarize(
    off_epa = mean(epa),
    pass_rate = mean(play_type == "pass"),
    .groups = "drop"
  ) %>%
  left_join(
    pbp %>%
      filter(!is.na(epa)) %>%
      group_by(season, defteam) %>%
      summarize(def_epa = mean(epa), .groups = "drop"),
    by = c("season", "posteam" = "defteam")
  )

# Create game dataset
# result is the home margin; nflverse spread_line is positive when the home
# team is favored, so the home side covers when result exceeds spread_line.
games <- schedules %>%
  filter(!is.na(result), !is.na(spread_line)) %>%
  left_join(team_features, by = c("season", "home_team" = "posteam")) %>%
  rename(home_off = off_epa, home_def = def_epa) %>%
  left_join(team_features %>% select(season, posteam, off_epa, def_epa),
            by = c("season", "away_team" = "posteam")) %>%
  rename(away_off = off_epa, away_def = def_epa) %>%
  mutate(
    spread_error = result - spread_line  # Positive = home beat the spread
  ) %>%
  select(home_off, home_def, away_off, away_def, spread_error) %>%
  drop_na()

# Scale features
# Standardize inputs; nnet is sensitive to feature scale.
scaled <- scale(games %>% select(-spread_error))
games_scaled <- cbind(as.data.frame(scaled), spread_error = games$spread_error)

# Split data
set.seed(42)
train_idx <- sample(nrow(games_scaled), 0.8 * nrow(games_scaled))
train <- games_scaled[train_idx, ]
test <- games_scaled[-train_idx, ]

# Train neural network
nn_model <- nnet(
  spread_error ~ .,
  data = train,
  size = 10,  # Hidden layer size
  linout = TRUE,  # Linear output for regression
  maxit = 500,
  decay = 0.01
)

# Predict and evaluate
predictions <- predict(nn_model, test)
mae <- mean(abs(predictions - test$spread_error))
rmse <- sqrt(mean((predictions - test$spread_error)^2))

cat("MAE:", round(mae, 2), "\n")
cat("RMSE:", round(rmse, 2), "\n")
cat("Correct Side %:", round(mean((predictions > 0) == (test$spread_error > 0)) * 100, 1), "%\n")
# Predict the home team's margin relative to the spread with an MLP regressor.
import nfl_data_py as nfl
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

pbp = nfl.import_pbp_data([2020, 2021, 2022, 2023])
schedules = nfl.import_schedules([2020, 2021, 2022, 2023])

# Calculate team features (EPA per play, offense and defense)
plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"].isin(["pass", "run"]))]

team_off = (plays.groupby(["season", "posteam"])
    .agg(off_epa=("epa", "mean"), pass_rate=("play_type", lambda x: (x == "pass").mean()))
    .reset_index())

team_def = (plays.groupby(["season", "defteam"])
    .agg(def_epa=("epa", "mean"))
    .reset_index())

team_features = team_off.merge(team_def, left_on=["season", "posteam"],
                                right_on=["season", "defteam"])

# Create game dataset with both sides' metrics attached
games = schedules[(schedules["result"].notna()) & (schedules["spread_line"].notna())].copy()
games = games.merge(team_features[["season", "posteam", "off_epa", "def_epa"]],
                    left_on=["season", "home_team"], right_on=["season", "posteam"])
games = games.rename(columns={"off_epa": "home_off", "def_epa": "home_def"})
games = games.merge(team_features[["season", "posteam", "off_epa", "def_epa"]],
                    left_on=["season", "away_team"], right_on=["season", "posteam"])
games = games.rename(columns={"off_epa": "away_off", "def_epa": "away_def"})

# result is the home margin; nflverse spread_line is positive when the home
# team is favored, so the home side covers when result > spread_line.
games["spread_error"] = games["result"] - games["spread_line"]
features = ["home_off", "home_def", "away_off", "away_def"]
games = games[features + ["spread_error"]].dropna()

# Scale and split (MLPs are sensitive to feature scale)
X = games[features]
y = games["spread_error"]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train neural network (two hidden layers, early stopping on validation loss)
nn = MLPRegressor(hidden_layer_sizes=(16, 8), max_iter=500, random_state=42, early_stopping=True)
nn.fit(X_train, y_train)

# Evaluate
predictions = nn.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
correct_side = ((predictions > 0) == (y_test > 0)).mean()

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"Correct Side %: {correct_side * 100:.1f}%")
Packages: nflfastR tidyverse nnet nfl_data_py pandas scikit-learn

Play-by-Play Analysis

Analyze play-level data including formations, personnel, and sequencing

Play Type Classification
Analyze play type distribution and tendencies.
Beginner
# Summarize the league-wide play-type mix and pass rate by down.
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Play type distribution
play_types <- pbp %>%
  filter(!is.na(play_type)) %>%
  count(play_type) %>%
  mutate(pct = n / sum(n) * 100) %>%
  arrange(desc(n))

# cat() renders labels cleanly; print() would show the "\n" escape and the
# [1] vector prefix literally.
cat("Play Type Distribution:\n")
print(play_types)

# Play type by down (pass/run snaps only)
by_down <- pbp %>%
  filter(play_type %in% c("pass", "run"), !is.na(down)) %>%
  group_by(down) %>%
  summarize(
    plays = n(),
    pass_rate = mean(play_type == "pass") * 100,
    .groups = "drop"
  )

cat("\nPass Rate by Down:\n")
print(by_down)
# Play-type mix for the season plus pass rate broken out by down.
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Play type distribution
valid_plays = pbp[pbp["play_type"].notna()]
play_types = (
    valid_plays.groupby("play_type").size()
    .reset_index(name="count")
    .sort_values("count", ascending=False)
)
play_types["pct"] = play_types["count"] / play_types["count"].sum() * 100
print("Play Type Distribution:")
print(play_types)

# Pass rate by down, over pass/run snaps only
run_pass = pbp[pbp["play_type"].isin(["pass", "run"]) & pbp["down"].notna()]
by_down = (
    run_pass.groupby("down")
    .agg(
        plays=("play_type", "count"),
        pass_rate=("play_type", lambda s: (s == "pass").mean() * 100),
    )
    .reset_index()
)
print("\nPass Rate by Down:")
print(by_down)
Packages: nflfastR tidyverse nfl_data_py pandas
Formation Analysis
Analyze offensive formation tendencies and effectiveness.
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Formation usage
# League-wide usage, pass rate, EPA, and success rate per offensive
# formation; formations with fewer than 100 snaps are dropped as small samples.
formation_usage <- pbp %>%
  filter(!is.na(offense_formation), play_type %in% c("pass", "run")) %>%
  group_by(offense_formation) %>%
  summarize(
    plays = n(),
    pass_rate = mean(play_type == "pass") * 100,
    epa = mean(epa, na.rm = TRUE),
    success_rate = mean(success, na.rm = TRUE) * 100,
    .groups = "drop"
  ) %>%
  filter(plays >= 100) %>%
  arrange(desc(plays))

print(formation_usage)

# Formation by team
# Each team's formation mix as a share of its own pass/run snaps
# (computed but not printed here).
team_formations <- pbp %>%
  filter(!is.na(offense_formation), play_type %in% c("pass", "run")) %>%
  group_by(posteam, offense_formation) %>%
  summarize(plays = n(), .groups = "drop") %>%
  group_by(posteam) %>%
  mutate(pct = plays / sum(plays) * 100) %>%
  arrange(posteam, desc(pct))
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Formation usage
# Pass/run snaps with a charted offensive formation
plays = pbp[(pbp["offense_formation"].notna()) & (pbp["play_type"].isin(["pass", "run"]))]

# NOTE: plays counts non-null EPA rows (the count is taken on the epa column).
formation_usage = (plays.groupby("offense_formation")
    .agg(
        plays=("epa", "count"),
        pass_rate=("play_type", lambda x: (x == "pass").mean() * 100),
        epa=("epa", "mean"),
        success_rate=("success", lambda x: x.mean() * 100)
    )
    .reset_index())

# Keep formations with at least 100 snaps (small-sample noise filter)
formation_usage = formation_usage[formation_usage["plays"] >= 100].sort_values("plays", ascending=False)
print("Formation Usage:")
print(formation_usage)
Packages: nflfastR tidyverse nfl_data_py pandas
Personnel Grouping Analysis
Analyze offensive personnel packages (11, 12, 21, etc.).
Intermediate
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Personnel grouping analysis
# League-wide usage and efficiency per offensive personnel package;
# groupings under 200 snaps are dropped as small samples.
personnel <- pbp %>%
  filter(!is.na(offense_personnel), play_type %in% c("pass", "run")) %>%
  group_by(offense_personnel) %>%
  summarize(
    plays = n(),
    pass_rate = mean(play_type == "pass") * 100,
    epa = mean(epa, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(plays >= 200) %>%
  arrange(desc(plays))

print("Personnel Grouping Usage:")
print(personnel)

# Team personnel tendencies
# Share of each team's snaps per package; packages under 5% are dropped.
# team_plays captures the team total before the finer grouping narrows it.
team_personnel <- pbp %>%
  filter(!is.na(offense_personnel), play_type %in% c("pass", "run")) %>%
  group_by(posteam) %>%
  mutate(team_plays = n()) %>%
  group_by(posteam, offense_personnel) %>%
  summarize(
    plays = n(),
    pct = n() / first(team_plays) * 100,
    .groups = "drop"
  ) %>%
  filter(pct >= 5) %>%
  arrange(posteam, desc(pct))
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Personnel grouping analysis
# Pass/run snaps with a charted offensive personnel package
plays = pbp[(pbp["offense_personnel"].notna()) & (pbp["play_type"].isin(["pass", "run"]))]

# NOTE: plays counts non-null EPA rows (the count is taken on the epa column).
personnel = (plays.groupby("offense_personnel")
    .agg(
        plays=("epa", "count"),
        pass_rate=("play_type", lambda x: (x == "pass").mean() * 100),
        epa=("epa", "mean")
    )
    .reset_index())

# Keep groupings with at least 200 snaps (small-sample filter)
personnel = personnel[personnel["plays"] >= 200].sort_values("plays", ascending=False)
print("Personnel Grouping Usage:")
print(personnel)
Packages: nflfastR tidyverse nfl_data_py pandas
Game Script Analysis
Analyze how teams adjust based on score differential.
Intermediate
# How play calling shifts with the scoreboard: pass rate and EPA by
# score-differential bucket.
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Ordered buckets, from trailing big to leading big
script_levels <- c("Down 14+", "Down 7-13", "Down 1-6",
                   "Tied", "Up 1-6", "Up 7-13", "Up 14+")

game_script <- pbp %>%
  filter(play_type %in% c("pass", "run"), !is.na(score_differential)) %>%
  mutate(
    # case_when checks conditions top-down, so each bucket is exclusive
    script = case_when(
      score_differential >= 14 ~ "Up 14+",
      score_differential >= 7 ~ "Up 7-13",
      score_differential >= 1 ~ "Up 1-6",
      score_differential == 0 ~ "Tied",
      score_differential >= -6 ~ "Down 1-6",
      score_differential >= -13 ~ "Down 7-13",
      TRUE ~ "Down 14+"
    )
  ) %>%
  mutate(script = factor(script, levels = script_levels)) %>%
  group_by(script) %>%
  summarize(
    plays = n(),
    pass_rate = mean(play_type == "pass") * 100,
    epa = mean(epa, na.rm = TRUE),
    .groups = "drop"
  )

print("Game Script Analysis:")
print(game_script)
# Game script analysis: pass rate and EPA by score-differential bucket.
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Game script analysis
plays = pbp[(pbp["play_type"].isin(["pass", "run"])) & (pbp["score_differential"].notna())].copy()

def get_script(diff):
    # Map the offense's score margin to one of seven game-script bands
    if diff >= 14: return "Up 14+"
    elif diff >= 7: return "Up 7-13"
    elif diff >= 1: return "Up 1-6"
    elif diff == 0: return "Tied"
    elif diff >= -6: return "Down 1-6"
    elif diff >= -13: return "Down 7-13"
    else: return "Down 14+"

plays["script"] = plays["score_differential"].apply(get_script)

game_script = (plays.groupby("script")
    .agg(
        plays=("epa", "count"),
        pass_rate=("play_type", lambda x: (x == "pass").mean() * 100),
        epa=("epa", "mean")
    )
    .reset_index())

# Order properly (trailing -> leading) using an ordered categorical
order = ["Down 14+", "Down 7-13", "Down 1-6", "Tied", "Up 1-6", "Up 7-13", "Up 14+"]
game_script["script"] = pd.Categorical(game_script["script"], categories=order, ordered=True)
game_script = game_script.sort_values("script")

print("Game Script Analysis:")
print(game_script)
Packages: nflfastR tidyverse nfl_data_py pandas
No-Huddle Analysis
Analyze no-huddle offense usage and effectiveness.
Intermediate
# No-huddle vs huddle splits, then rank teams by no-huddle volume.
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# No-huddle analysis: compare efficiency with and without a huddle
no_huddle <- pbp %>%
  filter(play_type %in% c("pass", "run"), !is.na(no_huddle)) %>%
  mutate(no_huddle = if_else(no_huddle == 1, "No-Huddle", "Huddle")) %>%
  group_by(no_huddle) %>%
  summarize(
    plays = n(),
    pass_rate = mean(play_type == "pass") * 100,
    epa = mean(epa, na.rm = TRUE),
    success_rate = mean(success, na.rm = TRUE) * 100,
    .groups = "drop"
  )

print("No-Huddle vs Huddle:")
print(no_huddle)

# Team no-huddle usage (volume leaders)
team_no_huddle <- pbp %>%
  filter(play_type %in% c("pass", "run"), no_huddle == 1) %>%
  group_by(posteam) %>%
  summarize(
    no_huddle_plays = n(),
    epa = mean(epa, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(no_huddle_plays))

print("\nTop No-Huddle Teams:")
print(head(team_no_huddle, 10))
# No-huddle vs huddle splits, plus team-level no-huddle volume.
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# No-huddle analysis — label each offensive snap
plays = pbp[(pbp["play_type"].isin(["pass", "run"])) & (pbp["no_huddle"].notna())].copy()
plays["huddle_type"] = plays["no_huddle"].apply(lambda x: "No-Huddle" if x == 1 else "Huddle")

no_huddle = (plays.groupby("huddle_type")
    .agg(
        plays=("epa", "count"),
        pass_rate=("play_type", lambda x: (x == "pass").mean() * 100),
        epa=("epa", "mean"),
        success_rate=("success", lambda x: x.mean() * 100)
    )
    .reset_index())

print("No-Huddle vs Huddle:")
print(no_huddle)

# Team usage (sorted by no-huddle play volume)
team_nh = (plays[plays["no_huddle"] == 1]
    .groupby("posteam")
    .agg(plays=("epa", "count"), epa=("epa", "mean"))
    .reset_index()
    .sort_values("plays", ascending=False))

print("\nTop No-Huddle Teams:")
print(team_nh.head(10))
Packages: nflfastR tidyverse nfl_data_py pandas
Shotgun vs Under Center
Compare performance from shotgun vs under center formations.
Beginner
# Shotgun vs under-center splits: usage share, pass rate, EPA, success rate.
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Shotgun analysis
shotgun_analysis <- pbp %>%
  filter(play_type %in% c("pass", "run"), !is.na(shotgun)) %>%
  mutate(formation = if_else(shotgun == 1, "Shotgun", "Under Center")) %>%
  group_by(formation) %>%
  summarize(
    plays = n(),
    # nrow(.) is the whole filtered table (magrittr dot), so pct is
    # each formation's share of ALL pass/run plays
    pct = n() / nrow(.) * 100,
    pass_rate = mean(play_type == "pass") * 100,
    epa = mean(epa, na.rm = TRUE),
    success_rate = mean(success, na.rm = TRUE) * 100,
    .groups = "drop"
  )

print("Shotgun vs Under Center:")
print(shotgun_analysis)

# By play type: run vs pass efficiency within each formation
by_play_type <- pbp %>%
  filter(play_type %in% c("pass", "run"), !is.na(shotgun)) %>%
  mutate(formation = if_else(shotgun == 1, "Shotgun", "Under Center")) %>%
  group_by(formation, play_type) %>%
  summarize(
    plays = n(),
    epa = mean(epa, na.rm = TRUE),
    .groups = "drop"
  )

print("\nBy Play Type:")
print(by_play_type)
# Shotgun vs under-center splits: usage share, pass rate, EPA, success rate.
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Shotgun analysis
plays = pbp[(pbp["play_type"].isin(["pass", "run"])) & (pbp["shotgun"].notna())].copy()
plays["formation"] = plays["shotgun"].apply(lambda x: "Shotgun" if x == 1 else "Under Center")

shotgun_analysis = (plays.groupby("formation")
    .agg(
        plays=("epa", "count"),
        pass_rate=("play_type", lambda x: (x == "pass").mean() * 100),
        epa=("epa", "mean"),
        success_rate=("success", lambda x: x.mean() * 100)
    )
    .reset_index())

# Share of all pass/run plays taken from each formation
shotgun_analysis["pct"] = shotgun_analysis["plays"] / shotgun_analysis["plays"].sum() * 100
print("Shotgun vs Under Center:")
print(shotgun_analysis)

# By play type: run vs pass efficiency within each formation
by_type = (plays.groupby(["formation", "play_type"])
    .agg(plays=("epa", "count"), epa=("epa", "mean"))
    .reset_index())

print("\nBy Play Type:")
print(by_type)
Packages: nflfastR tidyverse nfl_data_py pandas
Motion and Shift Analysis
Analyze pre-snap motion usage and its impact on play success.
Intermediate
# Pre-snap motion impact: league splits plus team-level usage and advantage.
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Motion analysis (rows with a known is_motion flag only)
motion_analysis <- pbp %>%
  filter(play_type %in% c("pass", "run"), !is.na(is_motion)) %>%
  mutate(motion = if_else(is_motion == 1, "Motion", "No Motion")) %>%
  group_by(motion) %>%
  summarize(
    plays = n(),
    pass_rate = mean(play_type == "pass") * 100,
    epa = mean(epa, na.rm = TRUE),
    success_rate = mean(success, na.rm = TRUE) * 100,
    .groups = "drop"
  )

print("Motion Impact on Play Success:")
print(motion_analysis)

# Team motion usage
team_motion <- pbp %>%
  filter(play_type %in% c("pass", "run")) %>%
  group_by(posteam) %>%
  summarize(
    total_plays = n(),
    motion_plays = sum(is_motion == 1, na.rm = TRUE),
    motion_rate = mean(is_motion == 1, na.rm = TRUE) * 100,
    motion_epa = mean(epa[is_motion == 1], na.rm = TRUE),
    # Note: "no motion" here deliberately includes plays with NA is_motion
    no_motion_epa = mean(epa[is_motion != 1 | is.na(is_motion)], na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(motion_advantage = motion_epa - no_motion_epa) %>%
  arrange(desc(motion_rate))

print("\nTeam Motion Usage:")
print(head(team_motion, 10))
# Pre-snap motion impact: league split plus team-level usage.
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Motion analysis — restrict to offensive pass/run snaps
plays = pbp[(pbp["play_type"].isin(["pass", "run"]))].copy()

motion = plays[plays["is_motion"] == 1]
# "No motion" deliberately includes plays where is_motion is missing
no_motion = plays[(plays["is_motion"] != 1) | (plays["is_motion"].isna())]

print("Motion Impact on Play Success:")
# Fixed: backslash-escaped quotes inside an f-string expression
# (f"{motion[\"epa\"]...}") are a SyntaxError before Python 3.12 —
# use single quotes for the column key instead.
print(f"With Motion: {len(motion)} plays, EPA: {motion['epa'].mean():.3f}")
print(f"No Motion: {len(no_motion)} plays, EPA: {no_motion['epa'].mean():.3f}")

# Team motion usage
def calc_motion_stats(group):
    """Per-team motion counts, usage rate, and EPA on motion snaps."""
    motion_plays = group[group["is_motion"] == 1]
    return pd.Series({
        "total_plays": len(group),
        "motion_plays": len(motion_plays),
        "motion_rate": len(motion_plays) / len(group) * 100 if len(group) > 0 else 0,
        "motion_epa": motion_plays["epa"].mean() if len(motion_plays) > 0 else 0
    })

team_motion = plays.groupby("posteam").apply(calc_motion_stats).reset_index()
team_motion = team_motion.sort_values("motion_rate", ascending=False)

print("\nTeam Motion Usage:")
print(team_motion.head(10))
Packages: nflfastR tidyverse nfl_data_py pandas
Play Sequencing Analysis
Analyze play-to-play sequencing and tendencies.
Advanced
# Play sequencing: what teams call after a success/failure, and which
# run/pass two-play sequences are most common and most efficient.
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Add previous play info (lag within each team's plays in a game)
play_sequence <- pbp %>%
  filter(play_type %in% c("pass", "run")) %>%
  group_by(game_id, posteam) %>%
  arrange(play_id) %>%
  mutate(
    prev_play = lag(play_type),
    prev_success = lag(success),
    prev_epa = lag(epa)
  ) %>%
  ungroup() %>%
  filter(!is.na(prev_play))

# Play calling after success/failure
after_result <- play_sequence %>%
  mutate(prev_result = if_else(prev_success == 1, "After Success", "After Failure")) %>%
  group_by(prev_result) %>%
  summarize(
    plays = n(),
    pass_rate = mean(play_type == "pass") * 100,
    epa = mean(epa, na.rm = TRUE),
    .groups = "drop"
  )

print("Play Calling After Success/Failure:")
print(after_result)

# Run-Run, Run-Pass, Pass-Run, Pass-Pass sequences
sequences <- play_sequence %>%
  mutate(sequence = paste(prev_play, play_type, sep = " -> ")) %>%
  group_by(sequence) %>%
  summarize(
    plays = n(),
    epa = mean(epa, na.rm = TRUE),
    success_rate = mean(success, na.rm = TRUE) * 100,
    .groups = "drop"
  ) %>%
  arrange(desc(plays))

print("\nPlay Sequences:")
print(sequences)
# Play sequencing: play calls after success/failure, and two-play
# run/pass sequence frequencies and efficiency.
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Filter and sort (chronological order within each game)
plays = pbp[pbp["play_type"].isin(["pass", "run"])].copy()
plays = plays.sort_values(["game_id", "play_id"])

# Add previous play info — shift is computed per (game, offense), so the
# global game_id/play_id sort is sufficient for within-group order
plays["prev_play"] = plays.groupby(["game_id", "posteam"])["play_type"].shift(1)
plays["prev_success"] = plays.groupby(["game_id", "posteam"])["success"].shift(1)

plays = plays[plays["prev_play"].notna()]

# After success/failure
plays["prev_result"] = plays["prev_success"].apply(
    lambda x: "After Success" if x == 1 else "After Failure")

after_result = (plays.groupby("prev_result")
    .agg(
        plays=("epa", "count"),
        pass_rate=("play_type", lambda x: (x == "pass").mean() * 100),
        epa=("epa", "mean")
    )
    .reset_index())

print("Play Calling After Success/Failure:")
print(after_result)

# Sequences like "run -> pass"
plays["sequence"] = plays["prev_play"] + " -> " + plays["play_type"]
sequences = (plays.groupby("sequence")
    .agg(
        plays=("epa", "count"),
        epa=("epa", "mean"),
        success_rate=("success", lambda x: x.mean() * 100)
    )
    .reset_index()
    .sort_values("plays", ascending=False))

print("\nPlay Sequences:")
print(sequences)
Packages: nflfastR tidyverse nfl_data_py pandas

Situational Analysis

Analyze decision-making in key game situations

Fourth Down Decision Analysis
Evaluate fourth down go-for-it decisions vs field goals/punts.
Intermediate
# Fourth down decisions: go/kick/punt mix by field position, plus
# go-for-it conversion rate.
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Fourth down decisions
fourth_downs <- pbp %>%
  filter(down == 4, !is.na(play_type)) %>%
  mutate(
    decision = case_when(
      play_type %in% c("pass", "run") ~ "Go for it",
      play_type == "field_goal" ~ "Field Goal",
      play_type == "punt" ~ "Punt",
      TRUE ~ "Other"
    )
  ) %>%
  filter(decision != "Other")

# Decision by field position (yardline_100 = yards from opponent end zone)
decision_analysis <- fourth_downs %>%
  mutate(
    zone = case_when(
      yardline_100 <= 3 ~ "Goal line (1-3)",
      yardline_100 <= 10 ~ "Red zone (4-10)",
      yardline_100 <= 40 ~ "Opp territory",
      yardline_100 <= 60 ~ "Midfield",
      TRUE ~ "Own territory"
    )
  ) %>%
  group_by(zone, decision) %>%
  summarize(plays = n(), .groups = "drop") %>%
  pivot_wider(names_from = decision, values_from = plays, values_fill = 0)

print("Fourth Down Decisions by Field Position:")
print(decision_analysis)

# Go-for-it success rate (conversion = first down or touchdown)
go_for_it <- fourth_downs %>%
  filter(decision == "Go for it") %>%
  summarize(
    attempts = n(),
    conversions = sum(first_down == 1 | touchdown == 1, na.rm = TRUE),
    success_rate = mean(first_down == 1 | touchdown == 1, na.rm = TRUE) * 100
  )

print("\nGo-for-it Success Rate:")
print(go_for_it)
# Fourth down decisions: go/kick/punt mix by field position, plus
# go-for-it conversion rate.
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Fourth down decisions
fourth = pbp[(pbp["down"] == 4) & (pbp["play_type"].notna())].copy()

def get_decision(play_type):
    # Classify the fourth-down call; "Other" (kneels, penalties) is dropped
    if play_type in ["pass", "run"]: return "Go for it"
    elif play_type == "field_goal": return "Field Goal"
    elif play_type == "punt": return "Punt"
    else: return "Other"

fourth["decision"] = fourth["play_type"].apply(get_decision)
fourth = fourth[fourth["decision"] != "Other"]

# Decision by field position (yardline_100 = yards from opponent end zone)
def get_zone(yd):
    if yd <= 3: return "Goal line (1-3)"
    elif yd <= 10: return "Red zone (4-10)"
    elif yd <= 40: return "Opp territory"
    elif yd <= 60: return "Midfield"
    else: return "Own territory"

fourth["zone"] = fourth["yardline_100"].apply(get_zone)

decision_analysis = (fourth.groupby(["zone", "decision"])
    .size()
    .reset_index(name="plays")
    .pivot(index="zone", columns="decision", values="plays")
    .fillna(0))

print("Fourth Down Decisions by Field Position:")
print(decision_analysis)

# Go-for-it success rate (conversion = first down or touchdown)
go_for_it = fourth[fourth["decision"] == "Go for it"]
success_rate = ((go_for_it["first_down"] == 1) | (go_for_it["touchdown"] == 1)).mean() * 100
print(f"\nGo-for-it Success Rate: {success_rate:.1f}%")
Packages: nflfastR tidyverse nfl_data_py pandas
Two-Point Conversion Analysis
Analyze two-point conversion attempts and success rates.
Intermediate
# Two-point conversion success rates: overall, by play type, and by team.
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2019:2023)

# Two-point conversion analysis
two_point <- pbp %>%
  filter(two_point_attempt == 1)

# Overall success rate (na.rm added for consistency with the summaries below)
overall <- two_point %>%
  summarize(
    attempts = n(),
    successes = sum(two_point_conv_result == "success", na.rm = TRUE),
    success_rate = mean(two_point_conv_result == "success", na.rm = TRUE) * 100
  )

print("Overall Two-Point Conversion Success:")
print(overall)

# By play type
by_type <- two_point %>%
  filter(play_type %in% c("pass", "run")) %>%
  group_by(play_type) %>%
  summarize(
    attempts = n(),
    successes = sum(two_point_conv_result == "success", na.rm = TRUE),
    success_rate = mean(two_point_conv_result == "success", na.rm = TRUE) * 100
  )

print("\nBy Play Type:")
print(by_type)

# Team leaders (minimum 5 attempts)
team_2pt <- two_point %>%
  group_by(posteam) %>%
  summarize(
    attempts = n(),
    successes = sum(two_point_conv_result == "success", na.rm = TRUE),
    success_rate = mean(two_point_conv_result == "success", na.rm = TRUE) * 100
  ) %>%
  filter(attempts >= 5) %>%
  arrange(desc(success_rate))

# Fixed: the leaderboard was computed but never displayed
print("\nTeam Two-Point Leaders:")
print(team_2pt)
# Two-point conversion success rates: overall and by play type (2019-2023).
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2019, 2020, 2021, 2022, 2023])

# Two-point conversions
two_point = pbp[pbp["two_point_attempt"] == 1].copy()

# Overall success rate
success = (two_point["two_point_conv_result"] == "success")
print(f"Overall 2PT Success Rate: {success.mean() * 100:.1f}% ({success.sum()}/{len(two_point)})")

# By play type (pass vs run attempts)
by_type = (two_point[two_point["play_type"].isin(["pass", "run"])]
    .groupby("play_type")
    .agg(
        attempts=("two_point_conv_result", "count"),
        successes=("two_point_conv_result", lambda x: (x == "success").sum()),
        success_rate=("two_point_conv_result", lambda x: (x == "success").mean() * 100)
    )
    .reset_index())

print("\nBy Play Type:")
print(by_type)
Packages: nflfastR tidyverse nfl_data_py pandas
Clock Management Analysis
Evaluate late-game clock management decisions.
Advanced
# Late-game clock management: play selection in one-score games inside
# the final five minutes of the 4th quarter.
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Late game clock management (4th quarter, close game)
late_game <- pbp %>%
  filter(
    qtr == 4,
    game_seconds_remaining <= 300,  # Last 5 minutes
    abs(score_differential) <= 8,   # One-score game
    play_type %in% c("pass", "run")
  )

# Play selection by score differential
late_plays <- late_game %>%
  mutate(
    situation = case_when(
      score_differential > 0 ~ "Leading",
      score_differential < 0 ~ "Trailing",
      TRUE ~ "Tied"
    )
  ) %>%
  group_by(situation) %>%
  summarize(
    plays = n(),
    pass_rate = mean(play_type == "pass") * 100,
    # play_clock is stored as a character column in nflfastR pbp data;
    # coerce to numeric, otherwise mean() returns NA with a warning.
    avg_play_clock = mean(as.numeric(play_clock), na.rm = TRUE),
    .groups = "drop"
  )

print("Late Game Play Selection:")
print(late_plays)
# Late-game play selection in one-score games (final 5 minutes of Q4).
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Late game clock management: Q4, last 5 minutes, one-score game
late_game = pbp[
    (pbp["qtr"] == 4) &
    (pbp["game_seconds_remaining"] <= 300) &
    (pbp["score_differential"].abs() <= 8) &
    (pbp["play_type"].isin(["pass", "run"]))
].copy()

def get_situation(diff):
    # Classify the offense's position on the scoreboard
    if diff > 0: return "Leading"
    elif diff < 0: return "Trailing"
    else: return "Tied"

late_game["situation"] = late_game["score_differential"].apply(get_situation)

late_plays = (late_game.groupby("situation")
    .agg(
        plays=("epa", "count"),
        pass_rate=("play_type", lambda x: (x == "pass").mean() * 100)
    )
    .reset_index())

print("Late Game Play Selection:")
print(late_plays)
Packages: nflfastR tidyverse nfl_data_py pandas
Timeout Usage Analysis
Analyze when teams use timeouts and their strategic impact.
Intermediate
# Timeout usage: distribution by quarter and per-team usage patterns.
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Timeout usage by situation
timeouts <- pbp %>%
  filter(timeout == 1) %>%
  mutate(
    half = if_else(qtr <= 2, "First Half", "Second Half"),
    close_game = abs(score_differential) <= 8
  )

# Timeout distribution
timeout_dist <- timeouts %>%
  group_by(half, qtr) %>%
  summarize(
    timeouts = n(),
    .groups = "drop"
  )

print("Timeout Distribution by Quarter:")
print(timeout_dist)

# Team timeout usage patterns
team_timeouts <- timeouts %>%
  group_by(timeout_team) %>%
  summarize(
    total_timeouts = n(),
    first_half = sum(half == "First Half"),
    second_half = sum(half == "Second Half"),
    # score_differential can be NA on some rows, which would make
    # close_game NA and poison the sum — drop NAs explicitly.
    close_games = sum(close_game, na.rm = TRUE),
    avg_time_remaining = mean(game_seconds_remaining, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(total_timeouts))

print("\nTeam Timeout Usage:")
print(team_timeouts)
# Timeout usage: distribution by quarter and per-team usage patterns.
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Filter timeouts and tag each with half / game-closeness context
timeouts = pbp[pbp["timeout"] == 1].copy()
timeouts["half"] = timeouts["qtr"].apply(lambda x: "First Half" if x <= 2 else "Second Half")
timeouts["close_game"] = timeouts["score_differential"].abs() <= 8

# Distribution by quarter
timeout_dist = (timeouts.groupby(["half", "qtr"])
    .size()
    .reset_index(name="timeouts"))

print("Timeout Distribution by Quarter:")
print(timeout_dist)

# Team patterns
team_timeouts = (timeouts.groupby("timeout_team")
    .agg(
        total_timeouts=("timeout", "count"),
        first_half=("half", lambda x: (x == "First Half").sum()),
        second_half=("half", lambda x: (x == "Second Half").sum()),
        close_games=("close_game", "sum"),
        avg_time_remaining=("game_seconds_remaining", "mean")
    )
    .reset_index()
    .sort_values("total_timeouts", ascending=False))

print("\nTeam Timeout Usage:")
print(team_timeouts)
Packages: nflfastR tidyverse nfl_data_py pandas
Challenge Success Rates
Analyze coach challenge success rates and tendencies.
Intermediate
# Replay review / challenge outcomes: overall, by season, and by play type.
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2019:2023)

# Replay review analysis (rows with a recorded review result)
replays <- pbp %>%
  filter(!is.na(replay_or_challenge_result))

# Overall success rate ("overturned" = the call was reversed)
success_rate <- replays %>%
  mutate(overturned = replay_or_challenge_result == "overturned") %>%
  summarize(
    total_reviews = n(),
    overturned = sum(overturned),
    success_rate = mean(overturned) * 100
  )

print("Overall Challenge Results:")
print(success_rate)

# By year
by_year <- replays %>%
  mutate(
    overturned = replay_or_challenge_result == "overturned",
    year = season
  ) %>%
  group_by(year) %>%
  summarize(
    reviews = n(),
    overturned = sum(overturned),
    rate = mean(overturned) * 100,
    .groups = "drop"
  )

print("\nChallenge Success by Year:")
print(by_year)

# By play type challenged (minimum 10 reviews for a stable rate)
by_type <- replays %>%
  mutate(overturned = replay_or_challenge_result == "overturned") %>%
  group_by(play_type) %>%
  summarize(
    reviews = n(),
    rate = mean(overturned) * 100,
    .groups = "drop"
  ) %>%
  filter(reviews >= 10) %>%
  arrange(desc(rate))

print("\nSuccess Rate by Play Type:")
print(by_type)
# Replay review / challenge outcomes: overall, by season, and by play type.
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2019, 2020, 2021, 2022, 2023])

# Replay reviews ("overturned" = the call was reversed)
replays = pbp[pbp["replay_or_challenge_result"].notna()].copy()
replays["overturned"] = replays["replay_or_challenge_result"] == "overturned"

# Overall success rate
total = len(replays)
overturned = replays["overturned"].sum()
print(f"Overall Challenge Success: {overturned}/{total} ({overturned/total*100:.1f}%)")

# By year
by_year = (replays.groupby("season")
    .agg(
        reviews=("overturned", "count"),
        overturned=("overturned", "sum"),
        rate=("overturned", lambda x: x.mean() * 100)
    )
    .reset_index())

print("\nChallenge Success by Year:")
print(by_year)

# By play type (minimum 10 reviews for a stable rate)
by_type = (replays.groupby("play_type")
    .agg(
        reviews=("overturned", "count"),
        rate=("overturned", lambda x: x.mean() * 100)
    )
    .reset_index())

by_type = by_type[by_type["reviews"] >= 10].sort_values("rate", ascending=False)
print("\nSuccess Rate by Play Type:")
print(by_type)
Packages: nflfastR tidyverse nfl_data_py pandas
Penalty Impact Analysis
Analyze penalty frequency, types, and their impact on games.
Intermediate
# Penalties: most common types, team penalty rates, and EPA impact.
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Penalty analysis
penalties <- pbp %>%
  filter(penalty == 1)

# Most common penalties
common_penalties <- penalties %>%
  filter(!is.na(penalty_type)) %>%
  count(penalty_type) %>%
  arrange(desc(n)) %>%
  head(15)

print("Most Common Penalties:")
print(common_penalties)

# Team penalty rates — penalty_team == posteam restricts the count to
# penalties committed by the offense itself (not its opponent)
team_penalties <- pbp %>%
  filter(play_type %in% c("pass", "run", "no_play")) %>%
  group_by(posteam) %>%
  summarize(
    plays = n(),
    penalties = sum(penalty == 1 & penalty_team == posteam, na.rm = TRUE),
    penalty_rate = mean(penalty == 1 & penalty_team == posteam, na.rm = TRUE) * 100,
    penalty_yards = sum(penalty_yards[penalty_team == posteam], na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(penalty_rate))

print("\nTeam Penalty Rates:")
print(team_penalties)

# Penalty impact on EPA (plays with vs without a flag)
penalty_impact <- pbp %>%
  filter(play_type %in% c("pass", "run")) %>%
  mutate(had_penalty = penalty == 1) %>%
  group_by(had_penalty) %>%
  summarize(
    plays = n(),
    avg_epa = mean(epa, na.rm = TRUE),
    .groups = "drop"
  )

print("\nEPA Impact of Penalties:")
print(penalty_impact)
# Penalties: most common types, team penalty rates, and EPA impact.
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Penalty analysis
penalties = pbp[pbp["penalty"] == 1]

# Most common penalty types
common = (penalties[penalties["penalty_type"].notna()]
    .groupby("penalty_type")
    .size()
    .reset_index(name="count")
    .sort_values("count", ascending=False)
    .head(15))

print("Most Common Penalties:")
print(common)

# Team penalty rates
plays = pbp[pbp["play_type"].isin(["pass", "run", "no_play"])].copy()

def calc_team_penalties(group):
    # group.name is the posteam key for this group; only count flags
    # committed by the offense itself (penalty_team == that team)
    team = group.name
    team_penalties = group[(group["penalty"] == 1) & (group["penalty_team"] == team)]
    return pd.Series({
        "plays": len(group),
        "penalties": len(team_penalties),
        "penalty_rate": len(team_penalties) / len(group) * 100 if len(group) > 0 else 0,
        "penalty_yards": team_penalties["penalty_yards"].sum()
    })

team_penalties = plays.groupby("posteam").apply(calc_team_penalties).reset_index()
team_penalties = team_penalties.sort_values("penalty_rate", ascending=False)

print("\nTeam Penalty Rates:")
print(team_penalties)

# EPA impact (plays with vs without a flag)
plays["had_penalty"] = plays["penalty"] == 1
penalty_impact = (plays.groupby("had_penalty")
    .agg(plays=("epa", "count"), avg_epa=("epa", "mean"))
    .reset_index())

print("\nEPA Impact of Penalties:")
print(penalty_impact)
Packages: nflfastR tidyverse nfl_data_py pandas

Team Analysis

Comprehensive team-level analysis and comparisons

Season-Long Trend Analysis
Track team performance trends throughout the season.
Intermediate
# Season-long trends: weekly team EPA, cumulative/rolling averages, and
# most improved vs most declined teams (weeks 1-3 vs 15+).
# Requires the zoo package for rollmean().
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Calculate weekly EPA for each team
weekly_epa <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
  group_by(posteam, week) %>%
  summarize(
    plays = n(),
    epa_per_play = mean(epa),
    success_rate = mean(success) * 100,
    .groups = "drop"
  )

# Calculate cumulative and rolling metrics
team_trends <- weekly_epa %>%
  group_by(posteam) %>%
  arrange(week) %>%
  mutate(
    # running mean of the weekly EPA values seen so far
    cumulative_epa = cumsum(epa_per_play) / row_number(),
    # trailing 3-week average; NA until 3 weeks are available
    rolling_3wk = zoo::rollmean(epa_per_play, k = 3, fill = NA, align = "right")
  ) %>%
  ungroup()

# Find improving and declining teams
week_1_3 <- team_trends %>%
  filter(week <= 3) %>%
  group_by(posteam) %>%
  summarize(early_epa = mean(epa_per_play))

week_15_plus <- team_trends %>%
  filter(week >= 15) %>%
  group_by(posteam) %>%
  summarize(late_epa = mean(epa_per_play))

trends <- week_1_3 %>%
  inner_join(week_15_plus, by = "posteam") %>%
  mutate(improvement = late_epa - early_epa) %>%
  arrange(desc(improvement))

print("Most Improved Teams:")
print(trends %>% head(5))
print("\nDeclined Most:")
print(trends %>% tail(5))
# Season-long trends: weekly team EPA, then early (weeks 1-3) vs late
# (weeks 15+) comparison to find improved/declined teams.
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Weekly EPA per team
plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"].isin(["pass", "run"]))]

weekly_epa = (plays.groupby(["posteam", "week"])
    .agg(
        plays=("epa", "count"),
        epa_per_play=("epa", "mean"),
        success_rate=("success", lambda x: x.mean() * 100)
    )
    .reset_index())

# Compare early vs late season
early = weekly_epa[weekly_epa["week"] <= 3].groupby("posteam")["epa_per_play"].mean()
late = weekly_epa[weekly_epa["week"] >= 15].groupby("posteam")["epa_per_play"].mean()

trends = pd.DataFrame({"early_epa": early, "late_epa": late})
trends["improvement"] = trends["late_epa"] - trends["early_epa"]
trends = trends.sort_values("improvement", ascending=False).reset_index()

print("Most Improved Teams:")
print(trends.head(5))
print("\nDeclined Most:")
print(trends.tail(5))
Packages: nflfastR tidyverse zoo nfl_data_py pandas
Home vs Away Performance
Compare team performance at home vs on the road.
Beginner
# Home vs away offensive performance per team, ranked by home-field edge.
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Home vs Away EPA
home_away <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
  mutate(location = if_else(posteam == home_team, "Home", "Away")) %>%
  group_by(posteam, location) %>%
  summarize(
    plays = n(),
    epa = mean(epa),
    success_rate = mean(success) * 100,
    .groups = "drop"
  ) %>%
  # one row per team with *_Home and *_Away columns
  pivot_wider(
    names_from = location,
    values_from = c(plays, epa, success_rate)
  ) %>%
  mutate(
    home_advantage = epa_Home - epa_Away
  ) %>%
  arrange(desc(home_advantage))

print("Home vs Away Performance:")
print(home_away %>% select(posteam, epa_Home, epa_Away, home_advantage))
# Home vs away offensive performance per team, ranked by home-field edge.
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Home vs Away — tag each offensive snap by the offense's venue
plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"].isin(["pass", "run"]))].copy()
plays["location"] = plays.apply(lambda x: "Home" if x["posteam"] == x["home_team"] else "Away", axis=1)

home_away = (plays.groupby(["posteam", "location"])
    .agg(
        plays=("epa", "count"),
        epa=("epa", "mean"),
        success_rate=("success", lambda x: x.mean() * 100)
    )
    .reset_index())

# Pivot into one row per team with Home/Away EPA side by side
home = home_away[home_away["location"] == "Home"].set_index("posteam")
away = home_away[home_away["location"] == "Away"].set_index("posteam")

comparison = pd.DataFrame({
    "Home EPA": home["epa"],
    "Away EPA": away["epa"]
})
comparison["Home Advantage"] = comparison["Home EPA"] - comparison["Away EPA"]
comparison = comparison.sort_values("Home Advantage", ascending=False).reset_index()

print("Home vs Away Performance:")
print(comparison)
Packages: nflfastR tidyverse nfl_data_py pandas
Point Differential Analysis
Analyze scoring and point differential trends.
Beginner
# Point differential rankings from game schedules/results.
library(nflfastR)
library(tidyverse)

schedules <- load_schedules(2023)

# Calculate point differential stats — pivot_longer doubles each game
# into one row per participating team
point_diff <- schedules %>%
  filter(!is.na(result)) %>%
  pivot_longer(c(home_team, away_team), names_to = "location", values_to = "team") %>%
  mutate(
    points_for = if_else(location == "home_team", home_score, away_score),
    points_against = if_else(location == "home_team", away_score, home_score),
    point_diff = points_for - points_against,
    win = point_diff > 0
  ) %>%
  group_by(team) %>%
  summarize(
    games = n(),
    wins = sum(win),
    total_pf = sum(points_for),
    total_pa = sum(points_against),
    total_diff = sum(point_diff),
    ppg = mean(points_for),
    papg = mean(points_against),
    avg_diff = mean(point_diff),
    .groups = "drop"
  ) %>%
  arrange(desc(total_diff))

print("Point Differential Rankings:")
print(point_diff)
# Point differential rankings from game schedules/results.
import nfl_data_py as nfl
import pandas as pd

schedules = nfl.import_schedules([2023])

# Calculate for each team — build one row per (team, game) by stacking
# home and away perspectives of every completed game
games = schedules[schedules["result"].notna()].copy()

# Home games
home = games[["home_team", "home_score", "away_score"]].copy()
home.columns = ["team", "points_for", "points_against"]

# Away games
away = games[["away_team", "away_score", "home_score"]].copy()
away.columns = ["team", "points_for", "points_against"]

all_games = pd.concat([home, away])
all_games["point_diff"] = all_games["points_for"] - all_games["points_against"]
all_games["win"] = all_games["point_diff"] > 0

point_diff = (all_games.groupby("team")
    .agg(
        games=("point_diff", "count"),
        wins=("win", "sum"),
        total_pf=("points_for", "sum"),
        total_pa=("points_against", "sum"),
        total_diff=("point_diff", "sum"),
        ppg=("points_for", "mean"),
        avg_diff=("point_diff", "mean")
    )
    .reset_index()
    .sort_values("total_diff", ascending=False))

print("Point Differential Rankings:")
print(point_diff)
Packages: nflfastR tidyverse nfl_data_py pandas
Pythagorean Win Expectation
Calculate expected wins based on points scored and allowed.
Intermediate
# Pythagorean win expectation: expected wins from points for/against,
# and "luck" = actual wins minus expected wins.
library(nflfastR)
library(tidyverse)

schedules <- load_schedules(2023)

# Calculate team scoring (one row per team-game via pivot_longer)
team_scoring <- schedules %>%
  filter(!is.na(result)) %>%
  pivot_longer(c(home_team, away_team), names_to = "location", values_to = "team") %>%
  mutate(
    pf = if_else(location == "home_team", home_score, away_score),
    pa = if_else(location == "home_team", away_score, home_score),
    win = pf > pa
  ) %>%
  group_by(team) %>%
  summarize(
    games = n(),
    actual_wins = sum(win),
    points_for = sum(pf),
    points_against = sum(pa),
    .groups = "drop"
  )

# Pythagorean expectation (exponent = 2.37 is the commonly used NFL value)
team_scoring <- team_scoring %>%
  mutate(
    pyth_exp = points_for^2.37 / (points_for^2.37 + points_against^2.37),
    expected_wins = pyth_exp * games,
    luck = actual_wins - expected_wins
  ) %>%
  arrange(desc(luck))

print("Pythagorean Win Expectation:")
print(team_scoring %>%
        select(team, actual_wins, expected_wins, luck) %>%
        mutate(across(c(expected_wins, luck), ~round(., 1))))
# Pythagorean win expectation: expected wins from points for/against,
# and "luck" = actual wins minus expected wins.
import nfl_data_py as nfl
import pandas as pd
import numpy as np  # NOTE(review): not used below; ** is plain pandas — could be dropped

schedules = nfl.import_schedules([2023])
games = schedules[schedules["result"].notna()].copy()

# Home games (team perspective: pf = points for, pa = points against)
home = games[["home_team", "home_score", "away_score"]].copy()
home.columns = ["team", "pf", "pa"]

# Away games
away = games[["away_team", "away_score", "home_score"]].copy()
away.columns = ["team", "pf", "pa"]

all_games = pd.concat([home, away])
all_games["win"] = all_games["pf"] > all_games["pa"]

# Aggregate to season totals per team
team_scoring = (all_games.groupby("team")
    .agg(
        games=("win", "count"),
        actual_wins=("win", "sum"),
        points_for=("pf", "sum"),
        points_against=("pa", "sum")
    )
    .reset_index())

# Pythagorean expectation (exponent 2.37 is the commonly used NFL value)
exp = 2.37
team_scoring["pyth_exp"] = (team_scoring["points_for"]**exp /
                             (team_scoring["points_for"]**exp + team_scoring["points_against"]**exp))
team_scoring["expected_wins"] = team_scoring["pyth_exp"] * team_scoring["games"]
team_scoring["luck"] = team_scoring["actual_wins"] - team_scoring["expected_wins"]

team_scoring = team_scoring.sort_values("luck", ascending=False)

print("Pythagorean Win Expectation:")
print(team_scoring[["team", "actual_wins", "expected_wins", "luck"]].round(1))
Packages: nflfastR tidyverse nfl_data_py pandas numpy
Division Rival Performance
Analyze team performance against division rivals vs other opponents.
Intermediate
# Division vs non-division performance: win percentage against division
# rivals compared with all other opponents.
library(nflfastR)
library(tidyverse)

schedules <- load_schedules(2023)
teams <- load_teams()

# Add division info for both participants, then flag division games
games <- schedules %>%
  filter(!is.na(result)) %>%
  left_join(teams %>% select(team_abbr, team_division),
            by = c("home_team" = "team_abbr")) %>%
  rename(home_div = team_division) %>%
  left_join(teams %>% select(team_abbr, team_division),
            by = c("away_team" = "team_abbr")) %>%
  rename(away_div = team_division) %>%
  mutate(division_game = home_div == away_div)

# Create team-game records (one row per team per game)
home <- games %>%
  select(team = home_team, opponent = away_team, pf = home_score,
         pa = away_score, division_game)

away <- games %>%
  select(team = away_team, opponent = home_team, pf = away_score,
         pa = home_score, division_game)

all_games <- bind_rows(home, away) %>%
  mutate(
    win = pf > pa,
    margin = pf - pa
  )

# Division vs Non-Division performance.
# Fixed: names_prefix prepends to the START of each output column
# (producing div_win_pct_TRUE), which broke the select() below that
# expects win_pct_div_TRUE. names_glue builds the intended names.
div_perf <- all_games %>%
  group_by(team, division_game) %>%
  summarize(
    games = n(),
    wins = sum(win),
    win_pct = mean(win) * 100,
    avg_margin = mean(margin),
    .groups = "drop"
  ) %>%
  pivot_wider(
    names_from = division_game,
    values_from = c(games, wins, win_pct, avg_margin),
    names_glue = "{.value}_div_{division_game}"
  )

print("Division vs Non-Division Performance:")
print(div_perf %>%
        select(team, win_pct_div_TRUE, win_pct_div_FALSE) %>%
        mutate(div_advantage = win_pct_div_TRUE - win_pct_div_FALSE) %>%
        arrange(desc(div_advantage)))
import nfl_data_py as nfl
import pandas as pd

schedules = nfl.import_schedules([2023])
teams = nfl.import_team_desc()

# Lookup of team abbreviation -> division, merged in once per side.
divisions = teams[["team_abbr", "team_division"]]

games = schedules[schedules["result"].notna()].copy()
for side in ("home", "away"):
    games = (games
        .merge(divisions, left_on=f"{side}_team", right_on="team_abbr", how="left")
        .rename(columns={"team_division": f"{side}_div"})
        .drop(columns="team_abbr"))
games["division_game"] = games["home_div"] == games["away_div"]

# One row per team per game: each game yields a home-perspective record
# and an away-perspective record.
def team_view(us, them):
    view = games[[f"{us}_team", f"{them}_team", f"{us}_score",
                  f"{them}_score", "division_game"]].copy()
    view.columns = ["team", "opponent", "pf", "pa", "division_game"]
    return view

all_games = pd.concat([team_view("home", "away"), team_view("away", "home")])
all_games["win"] = all_games["pf"] > all_games["pa"]
all_games["margin"] = all_games["pf"] - all_games["pa"]

# Win rate and scoring margin, split by division vs non-division opponents.
div_perf = (all_games.groupby(["team", "division_game"])
    .agg(games=("win", "count"), wins=("win", "sum"),
         win_pct=("win", lambda x: x.mean() * 100), avg_margin=("margin", "mean"))
    .reset_index())

div_games = div_perf[div_perf["division_game"]].set_index("team")["win_pct"]
non_div = div_perf[~div_perf["division_game"]].set_index("team")["win_pct"]

comparison = pd.DataFrame({"Division": div_games, "Non-Division": non_div})
comparison["Advantage"] = comparison["Division"] - comparison["Non-Division"]
comparison = comparison.sort_values("Advantage", ascending=False).reset_index()

print("Division vs Non-Division Performance:")
print(comparison)
Packages: nflfastR tidyverse nfl_data_py pandas
Playoff vs Regular Season
Compare team performance in playoffs vs regular season.
Intermediate
library(nflfastR)
library(tidyverse)

# Load multiple years for a usable playoff sample
pbp <- load_pbp(2020:2023)

# Separate regular season and playoffs.
# FIX: classify with season_type ("REG"/"POST") instead of `week <= 18`.
# The regular season was 17 weeks in 2020 and 18 weeks from 2021 on, so a
# week cutoff mislabels 2020 wild-card games (played in week 18).
perf_comparison <- pbp %>%
  filter(play_type %in% c("pass", "run"), !is.na(epa)) %>%
  mutate(game_type = if_else(season_type == "REG",
                             "Regular Season", "Playoffs")) %>%
  group_by(posteam, game_type) %>%
  summarize(
    plays = n(),
    epa_per_play = mean(epa),
    success_rate = mean(success, na.rm = TRUE) * 100,
    pass_rate = mean(play_type == "pass") * 100,
    .groups = "drop"
  ) %>%
  filter(plays >= 100)  # Minimum plays threshold

# Restrict the comparison to teams that actually played playoff games
playoff_teams <- perf_comparison %>%
  filter(game_type == "Playoffs") %>%
  pull(posteam) %>%
  unique()

# Side-by-side EPA with the playoff "bump" (positive = better in playoffs)
comparison <- perf_comparison %>%
  filter(posteam %in% playoff_teams) %>%
  select(posteam, game_type, epa_per_play) %>%
  pivot_wider(names_from = game_type, values_from = epa_per_play) %>%
  mutate(
    playoff_bump = Playoffs - `Regular Season`
  ) %>%
  arrange(desc(playoff_bump))

print("Playoff vs Regular Season EPA (2020-2023):")
print(comparison)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2020, 2021, 2022, 2023])

# Filter to pass/run plays that have a computed EPA
plays = pbp[(pbp["play_type"].isin(["pass", "run"])) & (pbp["epa"].notna())].copy()

# FIX: classify with season_type ("REG"/"POST") instead of `week <= 18`.
# The regular season was 17 weeks in 2020 and 18 weeks from 2021 on, so a
# week cutoff mislabels 2020 wild-card games (played in week 18).
plays["game_type"] = plays["season_type"].map(
    {"REG": "Regular Season", "POST": "Playoffs"})

# Performance by game type
perf = (plays.groupby(["posteam", "game_type"])
    .agg(
        plays=("epa", "count"),
        epa_per_play=("epa", "mean"),
        success_rate=("success", lambda x: x.mean() * 100),
        pass_rate=("play_type", lambda x: (x == "pass").mean() * 100)
    )
    .reset_index())

perf = perf[perf["plays"] >= 100]  # minimum-sample threshold

# Restrict the comparison to teams that actually played playoff games
playoff_teams = perf[perf["game_type"] == "Playoffs"]["posteam"].unique()
comparison = perf[perf["posteam"].isin(playoff_teams)].copy()

# Pivot regular-season and playoff EPA side by side
reg = comparison[comparison["game_type"] == "Regular Season"].set_index("posteam")["epa_per_play"]
playoff = comparison[comparison["game_type"] == "Playoffs"].set_index("posteam")["epa_per_play"]

result = pd.DataFrame({"Regular Season": reg, "Playoffs": playoff})
result["Playoff Bump"] = result["Playoffs"] - result["Regular Season"]
result = result.sort_values("Playoff Bump", ascending=False).reset_index()

print("Playoff vs Regular Season EPA (2020-2023):")
print(result)
Packages: nflfastR tidyverse nfl_data_py pandas
Strength of Schedule Analysis
Calculate and compare strength of schedule across teams.
Advanced
library(nflfastR)
library(tidyverse)

schedules <- load_schedules(2023)

# Keep only completed games
games <- schedules %>%
  filter(!is.na(result))

# One row per team per game (home and away perspectives)
home <- games %>%
  select(team = home_team, pf = home_score, pa = away_score) %>%
  mutate(win = pf > pa)

away <- games %>%
  select(team = away_team, pf = away_score, pa = home_score) %>%
  mutate(win = pf > pa)

# Season record per team
records <- bind_rows(home, away) %>%
  group_by(team) %>%
  summarize(
    wins = sum(win),
    losses = sum(!win),
    win_pct = mean(win),
    .groups = "drop"
  )

# SOS = mean season win pct of each distinct opponent a team faced
# (the %in% filter counts repeat opponents once)
calculate_sos <- function(team_name) {
  opponents <- c(
    games$away_team[games$home_team == team_name],
    games$home_team[games$away_team == team_name]
  )

  opp_records <- records %>%
    filter(team %in% opponents)

  mean(opp_records$win_pct)
}

# map_dbl() replaces the slower rowwise() approach, and the rank is
# computed in a single step (no placeholder NA that gets overwritten)
sos <- records %>%
  mutate(
    sos = map_dbl(team, calculate_sos),
    sos_rank = rank(-sos)
  )

print("Strength of Schedule Rankings:")
print(sos %>%
        select(team, wins, losses, win_pct, sos) %>%
        arrange(desc(sos)) %>%
        mutate(
          sos = round(sos, 3),
          rank = row_number()
        ))
import nfl_data_py as nfl
import pandas as pd

schedules = nfl.import_schedules([2023])
games = schedules[schedules["result"].notna()].copy()

# One row per team per game, then aggregate into season records.
def side_results(us, them):
    side = games[[f"{us}_team", f"{us}_score", f"{them}_score"]].copy()
    side.columns = ["team", "pf", "pa"]
    side["win"] = side["pf"] > side["pa"]
    return side

all_games = pd.concat([side_results("home", "away"), side_results("away", "home")])
records = (all_games.groupby("team")
    .agg(wins=("win", "sum"), losses=("win", lambda x: (~x).sum()),
         win_pct=("win", "mean"))
    .reset_index())

# SOS = mean season win pct of each distinct opponent a team faced
# (the isin() filter counts repeat opponents once).
def calc_sos(team):
    opponents = (games.loc[games["home_team"] == team, "away_team"].tolist()
                 + games.loc[games["away_team"] == team, "home_team"].tolist())
    return records.loc[records["team"].isin(opponents), "win_pct"].mean()

records["sos"] = records["team"].apply(calc_sos)
records = records.sort_values("sos", ascending=False)
records["rank"] = range(1, len(records) + 1)

print("Strength of Schedule Rankings:")
print(records[["rank", "team", "wins", "losses", "sos"]].round(3))
Packages: nflfastR tidyverse nfl_data_py pandas
Year-over-Year Improvement
Track which teams improved or regressed from previous season.
Intermediate
library(nflfastR)
library(tidyverse)

# Seasons to compare
pbp_2022 <- load_pbp(2022)
pbp_2023 <- load_pbp(2023)

# Per-team offensive efficiency for one season of play-by-play data
team_epa_summary <- function(pbp_data, season_year) {
  pbp_data %>%
    filter(play_type %in% c("pass", "run"), !is.na(epa)) %>%
    group_by(team = posteam) %>%
    summarize(
      plays = n(),
      epa_per_play = mean(epa),
      success_rate = mean(success, na.rm = TRUE) * 100,
      pass_rate = mean(play_type == "pass") * 100,
      .groups = "drop"
    ) %>%
    mutate(season = season_year)
}

epa_prev <- team_epa_summary(pbp_2022, 2022)
epa_curr <- team_epa_summary(pbp_2023, 2023)

# Join the two seasons and measure each team's change in EPA and
# success rate (positive = improvement)
improvement <- epa_prev %>%
  select(team, epa_2022 = epa_per_play, sr_2022 = success_rate) %>%
  inner_join(
    epa_curr %>% select(team, epa_2023 = epa_per_play, sr_2023 = success_rate),
    by = "team"
  ) %>%
  mutate(
    epa_change = epa_2023 - epa_2022,
    sr_change = sr_2023 - sr_2022
  ) %>%
  arrange(desc(epa_change))

print("Year-over-Year EPA Change (2022 to 2023):")
print(improvement %>%
        mutate(across(where(is.numeric), ~round(., 3))))
import nfl_data_py as nfl
import pandas as pd

# Seasons to compare
pbp_prev = nfl.import_pbp_data([2022])
pbp_curr = nfl.import_pbp_data([2023])

def calc_team_epa(pbp, year):
    """Per-team offensive efficiency summary for one season of pbp data."""
    mask = pbp["play_type"].isin(["pass", "run"]) & pbp["epa"].notna()
    summary = (pbp[mask]
        .groupby("posteam")
        .agg(
            plays=("epa", "count"),
            epa_per_play=("epa", "mean"),
            success_rate=("success", lambda s: s.mean() * 100),
            pass_rate=("play_type", lambda s: (s == "pass").mean() * 100),
        )
        .reset_index()
        .rename(columns={"posteam": "team"}))
    summary["season"] = year
    return summary

epa_2022 = calc_team_epa(pbp_prev, 2022)
epa_2023 = calc_team_epa(pbp_curr, 2023)

# Join the two seasons and measure each team's change (positive = improvement)
cols = ["team", "epa_per_play", "success_rate"]
comparison = epa_2022[cols].merge(epa_2023[cols], on="team",
                                  suffixes=("_2022", "_2023"))
comparison["epa_change"] = comparison["epa_per_play_2023"] - comparison["epa_per_play_2022"]
comparison["sr_change"] = comparison["success_rate_2023"] - comparison["success_rate_2022"]
comparison = comparison.sort_values("epa_change", ascending=False)

print("Year-over-Year EPA Change (2022 to 2023):")
print(comparison.round(3))
Packages: nflfastR tidyverse nfl_data_py pandas
Quick Package Reference
R Packages
  • nflfastR - Play-by-play data with EPA
  • nflplotR - NFL team logos & plotting
  • tidyverse - Data manipulation & visualization
  • ggplot2 - Advanced visualizations
Python Packages
  • nfl_data_py - NFL data (nflverse compatible)
  • pandas - Data manipulation
  • matplotlib - Visualizations
  • scikit-learn - Machine learning

Ready to Dive Deeper?

Learn the theory behind these techniques in our comprehensive tutorial series

Browse Tutorials