Copy-paste-ready R and Python code for NFL analytics, covering everything from data loading to machine learning models.
Load NFL play-by-play data using nflfastR, nfl_data_py, and various APIs
# Install and load nflfastR
# install.packages("nflfastR")
library(nflfastR)
library(tidyverse)
# Load a single season of play-by-play data
pbp_2023 <- load_pbp(2023)
# Load multiple seasons (load_pbp accepts a vector of seasons)
pbp_multi <- load_pbp(2021:2023)
# View structure
glimpse(pbp_2023)
# Quick summary: one row per play
cat("Total plays:", nrow(pbp_2023), "\n")
cat("Columns:", ncol(pbp_2023), "\n")
# Install: pip install nfl_data_py
import nfl_data_py as nfl
import pandas as pd

# Pull one season of play-by-play data
pbp_2023 = nfl.import_pbp_data([2023])

# Pull several seasons in one call
pbp_multi = nfl.import_pbp_data(list(range(2021, 2024)))

# Inspect the frame's structure
print(pbp_2023.info())

# Quick summary: one row per play
print(f"Total plays: {pbp_2023.shape[0]}")
print(f"Columns: {pbp_2023.shape[1]}")
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
# Load current rosters (one row per player)
rosters <- fast_scraper_roster(2023)
# Load historical rosters across several seasons
rosters_multi <- fast_scraper_roster(2020:2023)
# Filter to quarterbacks and keep a few identifying columns
qbs <- rosters %>%
filter(position == "QB") %>%
select(full_name, team, age, years_exp, draft_number)
head(qbs)
import nfl_data_py as nfl
import pandas as pd

# Current-season rosters (one row per player)
rosters = nfl.import_rosters([2023])

# Historical rosters across several seasons
rosters_multi = nfl.import_rosters(list(range(2020, 2024)))

# Quarterbacks only, with a few identifying columns
qb_cols = ["player_name", "team", "age", "years_exp", "draft_number"]
qbs = rosters.loc[rosters["position"] == "QB", qb_cols]
print(qbs.head())
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
# Load Next Gen Stats - Passing
ngs_passing <- load_nextgen_stats(
seasons = 2023,
stat_type = "passing"
)
# Load Next Gen Stats - Rushing
ngs_rushing <- load_nextgen_stats(
seasons = 2023,
stat_type = "rushing"
)
# Load Next Gen Stats - Receiving
ngs_receiving <- load_nextgen_stats(
seasons = 2023,
stat_type = "receiving"
)
# View top passers by avg air yards to the sticks
ngs_passing %>%
filter(week == 0) %>% # week 0 rows hold the season totals
arrange(desc(avg_air_yards_to_sticks)) %>%
select(player_display_name, team_abbr, avg_air_yards_to_sticks,
completion_percentage_above_expectation) %>%
head(10)
import nfl_data_py as nfl
import pandas as pd
# Load Next Gen Stats - Passing
ngs_passing = nfl.import_ngs_data(
stat_type="passing",
years=[2023]
)
# Load Next Gen Stats - Rushing
ngs_rushing = nfl.import_ngs_data(
stat_type="rushing",
years=[2023]
)
# Load Next Gen Stats - Receiving
ngs_receiving = nfl.import_ngs_data(
stat_type="receiving",
years=[2023]
)
# View top passers by avg air yards (week == 0 rows hold season totals)
season_totals = ngs_passing[ngs_passing["week"] == 0]
top_passers = season_totals.nlargest(10, "avg_air_yards_to_sticks")[
["player_display_name", "team_abbr", "avg_air_yards_to_sticks",
"completion_percentage_above_expectation"]
]
print(top_passers)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
# Load NFL combine results
combine <- load_combine()
# Filter to recent draft classes
combine_recent <- combine %>%
filter(draft_year >= 2020)
# Fastest 3 forty-yard dash times within each position
# (slice_min keeps ties by default, so a position can exceed 3 rows)
combine_recent %>%
filter(!is.na(forty)) %>%
group_by(pos) %>%
slice_min(forty, n = 3) %>%
select(draft_year, player_name, pos, forty, vertical, broad_jump) %>%
arrange(pos, forty)
import nfl_data_py as nfl
import pandas as pd

# Load combine data
combine = nfl.import_combine_data()

# Filter to recent draft classes
combine_recent = combine[combine["draft_year"] >= 2020]

# Fastest 3 forty-yard dash times per position.
# Sort then groupby().head(3) keeps the grouping column and avoids the
# deprecated DataFrameGroupBy.apply-on-group-columns pattern.
top_forty = (
    combine_recent[combine_recent["forty"].notna()]
    .sort_values("forty")
    .groupby("pos")
    .head(3)
    .sort_values(["pos", "forty"])
    [["draft_year", "player_name", "pos", "forty", "vertical", "broad_jump"]]
    .reset_index(drop=True)
)
print(top_forty)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

# Load schedules with results (result = home score - away score)
schedules <- load_schedules(2020:2023)

# Completed games only
completed <- schedules %>%
  filter(!is.na(result))

# Home win percentage
home_win_pct <- completed %>%
  summarize(
    home_wins = sum(result > 0),
    away_wins = sum(result < 0),
    ties = sum(result == 0),
    home_win_pct = home_wins / n()
  )
print(home_win_pct)

# Spread analysis.
# In nflverse schedules, spread_line is the home team's expected margin
# (positive = home favored), so the home side covers when the actual
# margin beats the line: result - spread_line > 0. The original
# `result + spread_line` compared against the wrong side of the line.
schedules %>%
  filter(!is.na(spread_line), !is.na(result)) %>%
  mutate(
    home_covered = result - spread_line > 0,
    away_covered = result - spread_line < 0
  ) %>%
  summarize(
    home_cover_rate = mean(home_covered, na.rm = TRUE),
    avg_spread = mean(spread_line),
    avg_margin = mean(abs(result))
  )
import nfl_data_py as nfl
import pandas as pd

# Load schedules with results (result = home score - away score)
schedules = nfl.import_schedules([2020, 2021, 2022, 2023])

# Completed games only
completed = schedules[schedules["result"].notna()]

# Home win percentage
home_wins = (completed["result"] > 0).sum()
away_wins = (completed["result"] < 0).sum()
ties = (completed["result"] == 0).sum()
home_win_pct = home_wins / len(completed)
print(f"Home Win %: {home_win_pct:.1%}")

# Spread analysis.
# spread_line is the home team's expected margin (positive = home
# favored), so home covers when result > spread_line. Use .copy() so
# the new column isn't assigned onto a view of `completed`.
with_spread = completed[completed["spread_line"].notna()].copy()
with_spread["home_covered"] = with_spread["result"] - with_spread["spread_line"] > 0
print(f"Home Cover Rate: {with_spread['home_covered'].mean():.1%}")
print(f"Avg Spread: {with_spread['spread_line'].mean():.1f}")
print(f"Avg Margin: {with_spread['result'].abs().mean():.1f}")
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
# Load participation data (players on the field for each play)
participation <- load_participation(2023)
# View structure
glimpse(participation)
# Snap counts: offense_players is a ";"-separated string of player ids,
# so each play row expands to one row per offensive player on the field
snap_counts <- participation %>%
separate_rows(offense_players, sep = ";") %>%
filter(offense_players != "") %>%
group_by(offense_players) %>%
summarize(
snaps = n(),
games = n_distinct(game_id)
) %>%
arrange(desc(snaps))
head(snap_counts, 20)
import nfl_data_py as nfl
import pandas as pd

# Pull 2023 participation data (who was on the field each play)
participation = nfl.import_participation([2023])

# Structure overview
print(participation.info())

# Simplified snap overview
# Note: participation data structure varies
print(f"Total plays with participation: {len(participation)}")
print(participation.head())
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
# Load injury report data
injuries <- load_injuries(2023)
# View structure
glimpse(injuries)
# Count injuries by team and report status (one column per status)
injury_summary <- injuries %>%
group_by(team, report_status) %>%
summarize(players = n(), .groups = "drop") %>%
pivot_wider(names_from = report_status, values_from = players, values_fill = 0)
print(injury_summary)
# Find most common injury types
injuries %>%
filter(!is.na(report_primary_injury)) %>%
count(report_primary_injury, sort = TRUE) %>%
head(10)
import nfl_data_py as nfl
import pandas as pd

# Load injuries data
injuries = nfl.import_injuries([2023])

# View structure
print(injuries.info())

# Count injuries by status. Column presence is guarded because the
# injuries schema has varied across nfl_data_py versions.
# (Indentation restored: the extracted snippet had flattened these
# if-blocks, which is a syntax error in Python.)
if "report_status" in injuries.columns:
    status_counts = injuries["report_status"].value_counts()
    print("Injury Status Distribution:")
    print(status_counts)

# Most common injury types
if "report_primary_injury" in injuries.columns:
    injury_types = injuries["report_primary_injury"].value_counts().head(10)
    print("\nMost Common Injuries:")
    print(injury_types)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
# Load draft picks (all available years)
draft <- load_draft_picks()
# Filter to recent years
recent_draft <- draft %>%
filter(season >= 2020)
# Picks and average draft slot by year and position
position_analysis <- recent_draft %>%
group_by(season, position) %>%
summarize(
picks = n(),
avg_pick = mean(pick),
.groups = "drop"
)
# First round picks by position
first_round <- recent_draft %>%
filter(round == 1) %>%
count(position, sort = TRUE)
print(first_round)
# Top 10 overall picks each year
recent_draft %>%
filter(pick <= 10) %>%
select(season, pick, team, pfr_player_name, position) %>%
arrange(season, pick)
import nfl_data_py as nfl
import pandas as pd
# Load draft picks (all available years)
draft = nfl.import_draft_picks()
# Filter to recent years
recent_draft = draft[draft["season"] >= 2020]
# First round picks by position
first_round = recent_draft[recent_draft["round"] == 1]
position_counts = first_round["position"].value_counts()
print("First Round Picks by Position (2020+):")
print(position_counts)
# Top 10 overall picks per draft
top_picks = recent_draft[recent_draft["pick"] <= 10][
["season", "pick", "team", "pfr_player_name", "position"]
].sort_values(["season", "pick"])
print("\nTop 10 Picks:")
print(top_picks)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
# Load weekly snap counts (one row per player-game)
snaps <- load_snap_counts(2023)
# View structure
glimpse(snaps)
# Top offensive snap leaders, minimum 10 games played
off_snaps <- snaps %>%
group_by(player, position, team) %>%
summarize(
games = n(),
total_off_snaps = sum(offense_snaps, na.rm = TRUE),
avg_off_snaps = mean(offense_snaps, na.rm = TRUE),
avg_off_pct = mean(offense_pct, na.rm = TRUE),
.groups = "drop"
) %>%
filter(games >= 10) %>%
arrange(desc(total_off_snaps))
# Top single-week snap share by skill position
# (slice_max keeps ties by default, so a group can exceed 5 rows)
snaps %>%
filter(position %in% c("QB", "RB", "WR", "TE")) %>%
group_by(position) %>%
slice_max(offense_pct, n = 5) %>%
select(player, team, position, week, offense_snaps, offense_pct)
import nfl_data_py as nfl
import pandas as pd
# Load weekly snap counts (one row per player-game)
snaps = nfl.import_snap_counts([2023])
# View structure
print(snaps.info())
# Top offensive snap leaders; "games" counts player-week rows present
off_snaps = (snaps.groupby(["player", "position", "team"])
.agg(
games=("week", "count"),
total_off_snaps=("offense_snaps", "sum"),
avg_off_snaps=("offense_snaps", "mean"),
avg_off_pct=("offense_pct", "mean")
)
.reset_index())
# Require at least 10 games before ranking by total snaps
off_snaps = off_snaps[off_snaps["games"] >= 10].sort_values(
"total_off_snaps", ascending=False)
print("Top Offensive Snap Leaders:")
print(off_snaps.head(20))
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

# Load schedules for win totals context
schedules <- load_schedules(2023)

# Actual wins per team: pivot each game into one row per participating
# team, then credit 1 for a win and 0.5 for a tie.
# (result = home score - away score, so result > 0 means a home win.)
# The original also precomputed home_win/away_win flags that were never
# used; they are dropped here.
actual_wins <- schedules %>%
  filter(!is.na(result)) %>%
  pivot_longer(
    cols = c(home_team, away_team),
    names_to = "location",
    values_to = "team"
  ) %>%
  mutate(
    win = case_when(
      location == "home_team" & result > 0 ~ 1,
      location == "away_team" & result < 0 ~ 1,
      result == 0 ~ 0.5,
      TRUE ~ 0
    )
  ) %>%
  group_by(team) %>%
  summarize(
    games = n(),
    wins = sum(win),
    losses = games - wins
  ) %>%
  arrange(desc(wins))
print(actual_wins)
import nfl_data_py as nfl
import pandas as pd

# Load schedules
schedules = nfl.import_schedules([2023])

# Completed games only (result = home score - away score)
completed = schedules[schedules["result"].notna()].copy()

# Win credit per game: 1 for a win, 0.5 for a tie
completed["home_credit"] = (completed["result"] > 0) + (completed["result"] == 0) * 0.5
completed["away_credit"] = (completed["result"] < 0) + (completed["result"] == 0) * 0.5

# Sum credit from home and away games. Series.add with fill_value=0
# keeps a team even if it appears on only one side, unlike the
# original inner merge, and avoids the deprecated groupby().apply()
# pattern.
home_wins = completed.groupby("home_team")["home_credit"].sum()
away_wins = completed.groupby("away_team")["away_credit"].sum()
wins = (home_wins.add(away_wins, fill_value=0)
        .rename("total_wins")
        .rename_axis("team")
        .reset_index()
        .sort_values("total_wins", ascending=False))
print("Team Win Totals:")
print(wins)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
# Load multiple data sources
pbp <- load_pbp(2023)
rosters <- fast_scraper_roster(2023)
# Aggregate passer stats from play-by-play
# (n() counts every play carrying a passer id)
passer_stats <- pbp %>%
filter(!is.na(passer_player_id)) %>%
group_by(passer_player_id, passer_player_name) %>%
summarize(
attempts = n(),
completions = sum(complete_pass),
yards = sum(passing_yards, na.rm = TRUE),
tds = sum(pass_touchdown),
ints = sum(interception),
epa = sum(epa, na.rm = TRUE),
.groups = "drop"
)
# Merge with roster data; gsis_id is the roster key matching pbp player ids
passer_with_roster <- passer_stats %>%
left_join(
rosters %>% select(gsis_id, full_name, team, age, height, weight, college),
by = c("passer_player_id" = "gsis_id")
)
# View combined data (min 100 attempts), ranked by total EPA
passer_with_roster %>%
filter(attempts >= 100) %>%
select(passer_player_name, team, age, college, attempts, yards, tds, epa) %>%
arrange(desc(epa))
import nfl_data_py as nfl
import pandas as pd
# Load multiple data sources
pbp = nfl.import_pbp_data([2023])
rosters = nfl.import_rosters([2023])
# Aggregate passer stats (every play carrying a passer id counts)
passer_stats = (pbp[pbp["passer_player_id"].notna()]
.groupby(["passer_player_id", "passer_player_name"])
.agg(
attempts=("play_id", "count"),
completions=("complete_pass", "sum"),
yards=("passing_yards", "sum"),
tds=("pass_touchdown", "sum"),
ints=("interception", "sum"),
epa=("epa", "sum")
)
.reset_index())
# Merge with roster data; gsis_id is the roster key matching pbp player ids
passer_with_roster = passer_stats.merge(
rosters[["gsis_id", "player_name", "team", "age", "college"]],
left_on="passer_player_id",
right_on="gsis_id",
how="left"
)
# View combined data (min 100 attempts), ranked by total EPA
result = (passer_with_roster[passer_with_roster["attempts"] >= 100]
.sort_values("epa", ascending=False)
[["passer_player_name", "team", "age", "college", "attempts", "yards", "tds", "epa"]])
print(result)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
# Load ESPN QBR, season-level summary
qbr <- load_espn_qbr(
league = "nfl",
seasons = 2023,
summary_type = "season"
)
# View structure
glimpse(qbr)
# Top QBs by Total QBR
qbr %>%
arrange(desc(qbr_total)) %>%
select(name_display, team_abb, qbr_total, pts_added, pass_rating) %>%
head(15)
# Weekly QBR for game-by-game trends
qbr_weekly <- load_espn_qbr(
league = "nfl",
seasons = 2023,
summary_type = "week"
)
# Best single-game performances
qbr_weekly %>%
arrange(desc(qbr_total)) %>%
select(name_display, week, team_abb, qbr_total, pts_added) %>%
head(10)
import nfl_data_py as nfl
import pandas as pd

# Load ESPN QBR data
qbr = nfl.import_qbr([2023])

# View structure
print(qbr.info())

# Top QBs by Total QBR, guarded on column presence since the QBR
# schema has varied across nfl_data_py versions.
# (Indentation restored: the extracted snippet had flattened this
# if/else, which is a syntax error in Python.)
if "qbr_total" in qbr.columns:
    top_qbr = qbr.nlargest(15, "qbr_total")[
        ["name_display", "team_abb", "qbr_total", "pts_added", "pass_rating"]
    ]
    print("Top QBs by Total QBR:")
    print(top_qbr)
else:
    print("Available columns:", qbr.columns.tolist())
    print(qbr.head())
nflfastR
tidyverse
nfl_data_py
pandas
Calculate and analyze Expected Points Added for teams, players, and situations
library(nflfastR)
library(tidyverse)
# Load data
pbp <- load_pbp(2023)
# Keep pass and rush plays that have an EPA value
plays <- pbp %>%
filter(!is.na(epa), play_type %in% c("pass", "run"))
# Offensive EPA/play by team (higher is better)
offense_epa <- plays %>%
group_by(posteam) %>%
summarize(
plays = n(),
total_epa = sum(epa),
epa_per_play = mean(epa),
pass_epa = mean(epa[play_type == "pass"]),
rush_epa = mean(epa[play_type == "run"]),
success_rate = mean(success)
) %>%
arrange(desc(epa_per_play))
# Defensive EPA/play allowed (lower is better)
defense_epa <- plays %>%
group_by(defteam) %>%
summarize(
plays = n(),
epa_allowed = mean(epa),
pass_epa_allowed = mean(epa[play_type == "pass"]),
rush_epa_allowed = mean(epa[play_type == "run"])
) %>%
arrange(epa_allowed)
print(offense_epa)
print(defense_epa)
import nfl_data_py as nfl
import pandas as pd
# Load data
pbp = nfl.import_pbp_data([2023])
# Keep pass and rush plays that have an EPA value
plays = pbp[(pbp["epa"].notna()) &
(pbp["play_type"].isin(["pass", "run"]))]
# Offensive EPA/play by team (higher is better)
offense_epa = (plays.groupby("posteam")
.agg(
plays=("epa", "count"),
total_epa=("epa", "sum"),
epa_per_play=("epa", "mean"),
success_rate=("success", "mean")
)
.sort_values("epa_per_play", ascending=False)
.reset_index())
# Defensive EPA/play allowed (lower is better)
defense_epa = (plays.groupby("defteam")
.agg(epa_allowed=("epa", "mean"))
.sort_values("epa_allowed")
.reset_index())
print("Offensive EPA/Play:")
print(offense_epa.head(10))
print("\nDefensive EPA/Play:")
print(defense_epa.head(10))
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# QB EPA analysis
# NOTE(review): "dropbacks" here is every play with a passer id and EPA;
# confirm whether sacks/scrambles carry a passer id in this data
qb_epa <- pbp %>%
filter(!is.na(epa), !is.na(passer_player_id)) %>%
group_by(passer_player_id, passer_player_name) %>%
summarize(
dropbacks = n(),
total_epa = sum(epa),
epa_per_dropback = mean(epa),
cpoe = mean(cpoe, na.rm = TRUE),
success_rate = mean(success),
avg_air_yards = mean(air_yards, na.rm = TRUE),
.groups = "drop"
) %>%
filter(dropbacks >= 200) %>%
arrange(desc(epa_per_dropback))
# Add rank after sorting by EPA per dropback
qb_epa <- qb_epa %>%
mutate(rank = row_number())
print(qb_epa, n = 20)
import nfl_data_py as nfl
import pandas as pd
pbp = nfl.import_pbp_data([2023])
# Filter to plays with EPA and an identified passer
pass_plays = pbp[(pbp["epa"].notna()) &
(pbp["passer_player_id"].notna())]
# QB EPA analysis
qb_epa = (pass_plays.groupby(["passer_player_id", "passer_player_name"])
.agg(
dropbacks=("epa", "count"),
total_epa=("epa", "sum"),
epa_per_dropback=("epa", "mean"),
cpoe=("cpoe", "mean"),
success_rate=("success", "mean"),
avg_air_yards=("air_yards", "mean")
)
.reset_index())
# Require 200+ dropbacks, then rank by EPA per dropback
qb_epa = (qb_epa[qb_epa["dropbacks"] >= 200]
.sort_values("epa_per_dropback", ascending=False)
.reset_index(drop=True))
qb_epa["rank"] = range(1, len(qb_epa) + 1)
print(qb_epa.head(20))
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# EPA by down (downs 1-4)
epa_by_down <- pbp %>%
filter(!is.na(epa), down <= 4) %>%
group_by(down) %>%
summarize(
plays = n(),
avg_epa = mean(epa),
pass_rate = mean(play_type == "pass", na.rm = TRUE),
success_rate = mean(success)
)
print(epa_by_down)
# EPA by distance-to-go buckets on first down
epa_by_distance <- pbp %>%
filter(!is.na(epa), down == 1) %>%
mutate(
distance_bucket = case_when(
ydstogo <= 3 ~ "1-3 yards",
ydstogo <= 7 ~ "4-7 yards",
ydstogo <= 10 ~ "8-10 yards",
TRUE ~ "11+ yards"
)
) %>%
group_by(distance_bucket) %>%
summarize(
plays = n(),
avg_epa = mean(epa),
success_rate = mean(success)
)
# EPA by field position (yardline_100 = yards from the opponent end zone)
epa_by_field <- pbp %>%
filter(!is.na(epa)) %>%
mutate(
field_zone = case_when(
yardline_100 >= 80 ~ "Own 1-20",
yardline_100 >= 60 ~ "Own 21-40",
yardline_100 >= 40 ~ "Midfield",
yardline_100 >= 20 ~ "Opp 21-40",
TRUE ~ "Red Zone"
)
) %>%
group_by(field_zone) %>%
summarize(
plays = n(),
avg_epa = mean(epa),
td_rate = mean(touchdown, na.rm = TRUE)
)
import nfl_data_py as nfl
import pandas as pd
import numpy as np

pbp = nfl.import_pbp_data([2023])

# EPA by down (downs 1-4). .copy() so the field_zone assignment below
# doesn't write onto a view of pbp (SettingWithCopyWarning).
plays = pbp[(pbp["epa"].notna()) & (pbp["down"] <= 4)].copy()
epa_by_down = (
    plays.groupby("down")
    .agg(
        plays=("epa", "count"),
        avg_epa=("epa", "mean"),
        success_rate=("success", "mean"),
    )
    .reset_index()
)
print("EPA by Down:")
print(epa_by_down)

# EPA by field position (yardline_100 = yards from the opponent end zone).
# Indentation of the helper restored: the extracted snippet had
# flattened it, which is a syntax error in Python.
def get_field_zone(yard):
    if yard >= 80:
        return "Own 1-20"
    elif yard >= 60:
        return "Own 21-40"
    elif yard >= 40:
        return "Midfield"
    elif yard >= 20:
        return "Opp 21-40"
    else:
        return "Red Zone"

plays["field_zone"] = plays["yardline_100"].apply(get_field_zone)
epa_by_field = (
    plays.groupby("field_zone")
    .agg(
        plays=("epa", "count"),
        avg_epa=("epa", "mean"),
        td_rate=("touchdown", "mean"),
    )
    .reset_index()
)
print("\nEPA by Field Position:")
print(epa_by_field)
nflfastR
tidyverse
nfl_data_py
pandas
numpy
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Offensive EPA/play per team-game
game_epa <- pbp %>%
filter(!is.na(epa)) %>%
group_by(game_id, posteam, week) %>%
summarize(
epa_per_play = mean(epa),
.groups = "drop"
)
# 4-game rolling average (right-aligned: the value at a given week
# covers the team's last 4 games played)
rolling_epa <- game_epa %>%
arrange(posteam, week) %>%
group_by(posteam) %>%
mutate(
rolling_4_epa = zoo::rollmean(epa_per_play, k = 4,
fill = NA, align = "right")
) %>%
ungroup()
# Compare early-season form vs end of season
# NOTE(review): teams on bye in week 5 or 17 drop out of this comparison
improvement <- rolling_epa %>%
filter(week %in% c(5, 17)) %>% # After 4 games vs end
select(posteam, week, rolling_4_epa) %>%
pivot_wider(names_from = week, values_from = rolling_4_epa,
names_prefix = "week_") %>%
mutate(improvement = week_17 - week_5) %>%
arrange(desc(improvement))
print(improvement)
import nfl_data_py as nfl
import pandas as pd
pbp = nfl.import_pbp_data([2023])
# Offensive EPA/play per team-game
game_epa = (pbp[pbp["epa"].notna()]
.groupby(["game_id", "posteam", "week"])
.agg(epa_per_play=("epa", "mean"))
.reset_index())
# 4-game rolling average (value covers the team's last 4 games played)
game_epa = game_epa.sort_values(["posteam", "week"])
game_epa["rolling_4_epa"] = (game_epa.groupby("posteam")
["epa_per_play"].transform(lambda x: x.rolling(4).mean()))
# Compare early-season form vs end of season
# NOTE(review): teams on bye in week 5 or 17 drop out of this merge
early = game_epa[game_epa["week"] == 5][["posteam", "rolling_4_epa"]]
late = game_epa[game_epa["week"] == 17][["posteam", "rolling_4_epa"]]
improvement = early.merge(late, on="posteam", suffixes=("_early", "_late"))
improvement["change"] = improvement["rolling_4_epa_late"] - improvement["rolling_4_epa_early"]
improvement = improvement.sort_values("change", ascending=False)
print(improvement)
nflfastR
tidyverse
zoo
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# EPA by quarter and play type (regulation quarters only)
epa_by_qtr <- pbp %>%
filter(!is.na(epa), play_type %in% c("pass", "run"), qtr <= 4) %>%
group_by(qtr, play_type) %>%
summarize(
plays = n(),
avg_epa = mean(epa),
success_rate = mean(success),
.groups = "drop"
)
print(epa_by_qtr)
# Pass vs run EPA advantage by quarter
epa_by_qtr %>%
pivot_wider(names_from = play_type, values_from = c(avg_epa, success_rate)) %>%
mutate(pass_advantage = avg_epa_pass - avg_epa_run)
import nfl_data_py as nfl
import pandas as pd
pbp = nfl.import_pbp_data([2023])
# Pass/run plays with EPA, regulation quarters only
plays = pbp[(pbp["epa"].notna()) &
(pbp["play_type"].isin(["pass", "run"])) &
(pbp["qtr"] <= 4)]
# EPA by quarter and play type
epa_by_qtr = (plays.groupby(["qtr", "play_type"])
.agg(
plays=("epa", "count"),
avg_epa=("epa", "mean"),
success_rate=("success", "mean")
)
.reset_index())
print("EPA by Quarter and Play Type:")
print(epa_by_qtr)
# Pass vs run EPA advantage by quarter
pivot = epa_by_qtr.pivot(index="qtr", columns="play_type", values="avg_epa")
pivot["pass_advantage"] = pivot["pass"] - pivot["run"]
print("\nPass EPA Advantage by Quarter:")
print(pivot)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Red zone = inside the opponent 20 (yardline_100 <= 20)
red_zone <- pbp %>%
filter(!is.na(epa), yardline_100 <= 20, play_type %in% c("pass", "run"))
# Team red zone efficiency
team_rz <- red_zone %>%
group_by(posteam) %>%
summarize(
rz_plays = n(),
rz_epa = mean(epa),
rz_td_rate = mean(touchdown, na.rm = TRUE),
rz_success_rate = mean(success),
pass_rate = mean(play_type == "pass")
) %>%
arrange(desc(rz_td_rate))
print(team_rz)
# Red zone vs rest of field comparison
# (note: the outside-RZ row includes all play types, not just pass/run)
rbind(
red_zone %>% summarize(zone = "Red Zone", epa = mean(epa), success = mean(success)),
pbp %>% filter(yardline_100 > 20, !is.na(epa)) %>%
summarize(zone = "Outside RZ", epa = mean(epa), success = mean(success))
)
import nfl_data_py as nfl
import pandas as pd
pbp = nfl.import_pbp_data([2023])
# Red zone = inside the opponent 20 (yardline_100 <= 20)
red_zone = pbp[(pbp["epa"].notna()) &
(pbp["yardline_100"] <= 20) &
(pbp["play_type"].isin(["pass", "run"]))]
# Team red zone efficiency
team_rz = (red_zone.groupby("posteam")
.agg(
rz_plays=("epa", "count"),
rz_epa=("epa", "mean"),
rz_td_rate=("touchdown", "mean"),
rz_success_rate=("success", "mean")
)
.reset_index()
.sort_values("rz_td_rate", ascending=False))
print("Team Red Zone Efficiency:")
print(team_rz)
# Red zone vs outside comparison
# (note: outside_rz includes all play types, not just pass/run)
outside_rz = pbp[(pbp["epa"].notna()) & (pbp["yardline_100"] > 20)]
print(f"\nRed Zone EPA: {red_zone['epa'].mean():.3f}")
print(f"Outside RZ EPA: {outside_rz['epa'].mean():.3f}")
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Third down plays with EPA
third_down <- pbp %>%
filter(!is.na(epa), down == 3)
# Conversion rate and EPA by distance-to-go bucket
third_by_dist <- third_down %>%
mutate(
dist_bucket = case_when(
ydstogo <= 3 ~ "Short (1-3)",
ydstogo <= 6 ~ "Medium (4-6)",
ydstogo <= 10 ~ "Long (7-10)",
TRUE ~ "Very Long (11+)"
)
) %>%
group_by(dist_bucket) %>%
summarize(
attempts = n(),
conversion_rate = mean(first_down, na.rm = TRUE),
avg_epa = mean(epa),
pass_rate = mean(play_type == "pass", na.rm = TRUE)
)
print(third_by_dist)
# Team third down rankings
team_3rd <- third_down %>%
group_by(posteam) %>%
summarize(
attempts = n(),
conversions = sum(first_down, na.rm = TRUE),
conv_rate = mean(first_down, na.rm = TRUE),
epa_per_play = mean(epa)
) %>%
arrange(desc(conv_rate))
print(team_3rd)
import nfl_data_py as nfl
import pandas as pd
import numpy as np

pbp = nfl.import_pbp_data([2023])

# Third-down plays with EPA. .copy() so the bucket column below is not
# assigned onto a view of pbp (SettingWithCopyWarning).
third_down = pbp[(pbp["epa"].notna()) & (pbp["down"] == 3)].copy()

# Distance-to-go buckets. Indentation of the helper restored: the
# extracted snippet had flattened it, which is a syntax error in Python.
def dist_bucket(ydstogo):
    if ydstogo <= 3:
        return "Short (1-3)"
    elif ydstogo <= 6:
        return "Medium (4-6)"
    elif ydstogo <= 10:
        return "Long (7-10)"
    else:
        return "Very Long (11+)"

third_down["dist_bucket"] = third_down["ydstogo"].apply(dist_bucket)

# Conversion rate and EPA by distance bucket
third_by_dist = (
    third_down.groupby("dist_bucket")
    .agg(
        attempts=("epa", "count"),
        conversion_rate=("first_down", "mean"),
        avg_epa=("epa", "mean"),
    )
    .reset_index()
)
print("Third Down by Distance:")
print(third_by_dist)

# Team third-down rankings
team_3rd = (
    third_down.groupby("posteam")
    .agg(
        attempts=("epa", "count"),
        conv_rate=("first_down", "mean"),
        epa_per_play=("epa", "mean"),
    )
    .reset_index()
    .sort_values("conv_rate", ascending=False)
)
print("\nTeam Third Down Rankings:")
print(team_3rd.head(10))
nflfastR
tidyverse
nfl_data_py
pandas
numpy
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Flag garbage time: 4th quarter with a lopsided win probability
# (wp < 10% or > 90%) or a 3+ score margin (>21 points)
pbp_filtered <- pbp %>%
filter(!is.na(epa)) %>%
mutate(
garbage_time = case_when(
qtr == 4 & (wp < 0.10 | wp > 0.90) ~ TRUE,
qtr == 4 & abs(score_differential) > 21 ~ TRUE,
TRUE ~ FALSE
)
)
# EPA/play per team, split by the garbage time flag
comparison <- pbp_filtered %>%
filter(play_type %in% c("pass", "run")) %>%
group_by(posteam, garbage_time) %>%
summarize(
plays = n(),
epa_per_play = mean(epa),
.groups = "drop"
) %>%
pivot_wider(names_from = garbage_time, values_from = c(plays, epa_per_play))
# Teams with biggest garbage time inflation
comparison %>%
mutate(epa_diff = epa_per_play_TRUE - epa_per_play_FALSE) %>%
arrange(desc(epa_diff)) %>%
select(posteam, epa_per_play_FALSE, epa_per_play_TRUE, epa_diff)
import nfl_data_py as nfl
import pandas as pd
pbp = nfl.import_pbp_data([2023])
# Plays with EPA; .copy() so new columns are not written onto a view
plays = pbp[pbp["epa"].notna()].copy()
# Flag garbage time: 4th quarter with a lopsided win probability
# (wp < 10% or > 90%) or a 3+ score margin (>21 points)
plays["garbage_time"] = (
((plays["qtr"] == 4) & ((plays["wp"] < 0.10) | (plays["wp"] > 0.90))) |
((plays["qtr"] == 4) & (plays["score_differential"].abs() > 21))
)
# Compare with and without garbage time (pass/run plays only)
plays_filtered = plays[plays["play_type"].isin(["pass", "run"])]
# EPA excluding garbage time
clean_epa = (plays_filtered[~plays_filtered["garbage_time"]]
.groupby("posteam")["epa"].mean())
# EPA including garbage time
all_epa = plays_filtered.groupby("posteam")["epa"].mean()
# Difference shows how much garbage time inflates a team's EPA
comparison = pd.DataFrame({
"clean_epa": clean_epa,
"all_epa": all_epa
})
comparison["garbage_inflation"] = comparison["all_epa"] - comparison["clean_epa"]
comparison = comparison.sort_values("garbage_inflation", ascending=False)
print("Garbage Time EPA Inflation:")
print(comparison.head(10))
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
# Load two consecutive seasons
pbp_2022 <- load_pbp(2022)
pbp_2023 <- load_pbp(2023)
# Team offensive EPA summary for one season of play-by-play
get_team_epa <- function(pbp, season) {
pbp %>%
filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
group_by(posteam) %>%
summarize(
epa_per_play = mean(epa),
pass_epa = mean(epa[play_type == "pass"]),
rush_epa = mean(epa[play_type == "run"]),
.groups = "drop"
) %>%
mutate(season = season)
}
epa_2022 <- get_team_epa(pbp_2022, 2022)
epa_2023 <- get_team_epa(pbp_2023, 2023)
# Join seasons by team to measure year-over-year stability
combined <- epa_2022 %>%
inner_join(epa_2023, by = "posteam", suffix = c("_2022", "_2023"))
# Correlations between the two seasons
cat("Overall EPA correlation:", cor(combined$epa_per_play_2022, combined$epa_per_play_2023), "\n")
cat("Pass EPA correlation:", cor(combined$pass_epa_2022, combined$pass_epa_2023), "\n")
cat("Rush EPA correlation:", cor(combined$rush_epa_2022, combined$rush_epa_2023), "\n")
import nfl_data_py as nfl
import pandas as pd

# Load two consecutive seasons
pbp_2022 = nfl.import_pbp_data([2022])
pbp_2023 = nfl.import_pbp_data([2023])

# Indentation of the helper restored: the extracted snippet had
# flattened the function body, which is a syntax error in Python.
def get_team_epa(pbp):
    """Mean EPA per pass/run play for each offense (posteam)."""
    plays = pbp[(pbp["epa"].notna()) &
                (pbp["play_type"].isin(["pass", "run"]))]
    return plays.groupby("posteam")["epa"].mean()

epa_2022 = get_team_epa(pbp_2022)
epa_2023 = get_team_epa(pbp_2023)

# Align the two seasons by team
combined = pd.DataFrame({
    "epa_2022": epa_2022,
    "epa_2023": epa_2023
}).dropna()

# Year-over-year stability of team EPA
correlation = combined["epa_2022"].corr(combined["epa_2023"])
print(f"Year-over-Year EPA Correlation: {correlation:.3f}")

# Biggest movers between seasons
combined["change"] = combined["epa_2023"] - combined["epa_2022"]
print("\nBiggest Improvements:")
print(combined.nlargest(5, "change"))
print("\nBiggest Declines:")
print(combined.nsmallest(5, "change"))
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Flag explosive plays (20+ yards gained; 40+ = big play)
explosive <- pbp %>%
filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
mutate(
explosive = yards_gained >= 20,
big_play = yards_gained >= 40
)
# Team explosive play rates and share of total EPA from explosives
team_explosive <- explosive %>%
group_by(posteam) %>%
summarize(
total_plays = n(),
explosive_plays = sum(explosive),
explosive_rate = mean(explosive) * 100,
explosive_epa = sum(epa[explosive]),
non_explosive_epa = sum(epa[!explosive]),
epa_from_explosive_pct = explosive_epa / sum(epa) * 100
) %>%
arrange(desc(explosive_rate))
print(team_explosive)
# League-wide EPA: explosive vs non-explosive plays
cat("\nAverage EPA on explosive plays:",
mean(explosive$epa[explosive$explosive]), "\n")
cat("Average EPA on non-explosive plays:",
mean(explosive$epa[!explosive$explosive]), "\n")
import nfl_data_py as nfl
import pandas as pd
pbp = nfl.import_pbp_data([2023])
# Pass/run plays with EPA; .copy() so the flag column isn't set on a view
plays = pbp[(pbp["epa"].notna()) &
(pbp["play_type"].isin(["pass", "run"]))].copy()
# Flag explosive plays (20+ yards gained)
plays["explosive"] = plays["yards_gained"] >= 20
# Team explosive play analysis
team_explosive = (plays.groupby("posteam")
.agg(
total_plays=("epa", "count"),
explosive_plays=("explosive", "sum"),
total_epa=("epa", "sum")
)
.reset_index())
team_explosive["explosive_rate"] = (team_explosive["explosive_plays"] /
team_explosive["total_plays"] * 100)
team_explosive = team_explosive.sort_values("explosive_rate", ascending=False)
print("Team Explosive Play Rates:")
print(team_explosive)
# League-wide EPA: explosive vs non-explosive plays
explosive_epa = plays[plays["explosive"]]["epa"].mean()
non_explosive_epa = plays[~plays["explosive"]]["epa"].mean()
print(f"\nAverage EPA on explosive plays: {explosive_epa:.3f}")
print(f"Average EPA on non-explosive plays: {non_explosive_epa:.3f}")
nflfastR
tidyverse
nfl_data_py
pandas
Analyze win probability, WPA, and clutch performance metrics
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Win probability is already calculated in pbp data
# Key columns: wp, vegas_wp, home_wp, away_wp
# (wp appears to be the possessing team's win probability — see the
# nflfastR field descriptions to confirm)
# View WP at key moments
key_plays <- pbp %>%
filter(!is.na(wp), !is.na(desc)) %>%
select(game_id, qtr, time, posteam, desc, wp, wpa) %>%
head(20)
print(key_plays)
# Average WP by score differential (min 100 plays, within 21 points)
wp_by_score <- pbp %>%
filter(!is.na(wp), !is.na(score_differential)) %>%
group_by(score_differential) %>%
summarize(
avg_wp = mean(wp),
plays = n()
) %>%
filter(plays >= 100, abs(score_differential) <= 21)
print(wp_by_score)
import nfl_data_py as nfl
import pandas as pd
pbp = nfl.import_pbp_data([2023])
# Win probability columns: wp, vegas_wp, home_wp, away_wp
# View key WP columns for a sample of plays
wp_cols = ["game_id", "qtr", "time", "posteam", "wp", "wpa"]
key_plays = pbp[pbp["wp"].notna()][wp_cols].head(20)
print("Sample Win Probability Data:")
print(key_plays)
# Average WP by score differential (min 100 plays, within 21 points)
wp_by_score = (pbp[pbp["wp"].notna() & pbp["score_differential"].notna()]
.groupby("score_differential")
.agg(avg_wp=("wp", "mean"), plays=("wp", "count"))
.reset_index())
wp_by_score = wp_by_score[
(wp_by_score["plays"] >= 100) &
(wp_by_score["score_differential"].abs() <= 21)
]
print("\nAverage WP by Score Differential:")
print(wp_by_score)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# QB WPA (win probability added) leaders, minimum 200 plays
qb_wpa <- pbp %>%
filter(!is.na(wpa), !is.na(passer_player_id)) %>%
group_by(passer_player_id, passer_player_name) %>%
summarize(
dropbacks = n(),
total_wpa = sum(wpa),
positive_plays = sum(wpa > 0),
negative_plays = sum(wpa < 0),
.groups = "drop"
) %>%
filter(dropbacks >= 200) %>%
arrange(desc(total_wpa))
print(qb_wpa)
# Single biggest WPA swings of the season (by absolute value)
pbp %>%
filter(!is.na(wpa)) %>%
arrange(desc(abs(wpa))) %>%
select(game_id, posteam, qtr, desc, wpa) %>%
head(10)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Season WPA totals per quarterback (min. 200 qualifying dropbacks).
mask = pbp["wpa"].notna() & pbp["passer_player_id"].notna()
qb_plays = pbp[mask]
grouped = qb_plays.groupby(["passer_player_id", "passer_player_name"])
qb_wpa = grouped.agg(
    dropbacks=("wpa", "count"),
    total_wpa=("wpa", "sum"),
    positive_plays=("wpa", lambda s: (s > 0).sum()),
    negative_plays=("wpa", lambda s: (s < 0).sum()),
).reset_index()
qualified = qb_wpa["dropbacks"] >= 200
qb_wpa = qb_wpa[qualified].sort_values("total_wpa", ascending=False)
print("QB WPA Leaders:")
print(qb_wpa)

# Ten largest swings of the season by absolute WPA.
with_wpa = pbp[pbp["wpa"].notna()].copy()
with_wpa["abs_wpa"] = with_wpa["wpa"].abs()
biggest_plays = with_wpa.nlargest(10, "abs_wpa")[
    ["game_id", "posteam", "qtr", "desc", "wpa"]
]
print("\nBiggest WPA Plays:")
print(biggest_plays)
nflfastR
tidyverse
nfl_data_py
pandas
# Measure QB performance in "clutch" spots: 4th quarter with the game
# still in doubt (win probability between 25% and 75%).
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Define clutch situations: 4th quarter, WP between 25-75%
clutch_plays <- pbp %>%
filter(
!is.na(epa),
qtr == 4,
wp >= 0.25 & wp <= 0.75
)
# QB clutch performance
# Minimum 30 clutch plays; ranked by total WPA in those spots.
qb_clutch <- clutch_plays %>%
filter(!is.na(passer_player_id)) %>%
group_by(passer_player_id, passer_player_name) %>%
summarize(
clutch_plays = n(),
clutch_epa = sum(epa),
clutch_wpa = sum(wpa, na.rm = TRUE),
clutch_success = mean(success),
.groups = "drop"
) %>%
filter(clutch_plays >= 30) %>%
arrange(desc(clutch_wpa))
print(qb_clutch)
# Compare clutch vs non-clutch
# Wide result has one column per is_clutch value (FALSE/TRUE); this bare
# pipeline auto-prints only in interactive sessions.
pbp %>%
filter(!is.na(passer_player_id), !is.na(epa)) %>%
mutate(
is_clutch = qtr == 4 & wp >= 0.25 & wp <= 0.75
) %>%
group_by(passer_player_name, is_clutch) %>%
summarize(epa_per_play = mean(epa), .groups = "drop") %>%
pivot_wider(names_from = is_clutch, values_from = epa_per_play)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# "Clutch" = 4th quarter with the game in doubt (WP between 25% and 75%).
is_clutch = (
    pbp["epa"].notna()
    & (pbp["qtr"] == 4)
    & (pbp["wp"] >= 0.25)
    & (pbp["wp"] <= 0.75)
)
clutch_plays = pbp[is_clutch]

# Aggregate QB performance inside those clutch snaps (min. 30 plays).
qb_rows = clutch_plays[clutch_plays["passer_player_id"].notna()]
qb_clutch = (
    qb_rows.groupby(["passer_player_id", "passer_player_name"])
    .agg(
        clutch_plays=("epa", "count"),
        clutch_epa=("epa", "sum"),
        clutch_wpa=("wpa", "sum"),
        clutch_success=("success", "mean"),
    )
    .reset_index()
)
qb_clutch = qb_clutch[qb_clutch["clutch_plays"] >= 30].sort_values(
    "clutch_wpa", ascending=False
)
print("QB Clutch Performance (4Q, WP 25-75%):")
print(qb_clutch)
nflfastR
tidyverse
nfl_data_py
pandas
# Surface the plays that swung win probability the most in 2023.
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Find biggest swing plays (largest WPA changes)
swing_plays <- pbp %>%
filter(!is.na(wpa)) %>%
mutate(abs_wpa = abs(wpa)) %>%
arrange(desc(abs_wpa)) %>%
select(game_id, week, posteam, qtr, time, down, ydstogo,
desc, wpa, wp) %>%
head(50)
print(swing_plays)
# Game-changing interceptions
# Ranked by absolute WPA: a pick can swing WP in either direction.
ints <- pbp %>%
filter(interception == 1, !is.na(wpa)) %>%
arrange(desc(abs(wpa))) %>%
select(game_id, passer_player_name, interception_player_name,
wpa, wp, desc) %>%
head(10)
print(ints)
# Game-changing touchdowns
# Sorted by raw (not absolute) WPA: the scoring team's biggest lifts.
tds <- pbp %>%
filter(touchdown == 1, !is.na(wpa)) %>%
arrange(desc(wpa)) %>%
select(game_id, posteam, qtr, wpa, wp, desc) %>%
head(10)
print(tds)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Fifty biggest momentum swings of the season, ranked by |WPA|.
swing_cols = ["game_id", "week", "posteam", "qtr", "time", "down",
              "ydstogo", "desc", "wpa", "wp"]
swing_plays = pbp[pbp["wpa"].notna()].copy()
swing_plays["abs_wpa"] = swing_plays["wpa"].abs()
swing_plays = swing_plays.nlargest(50, "abs_wpa")[swing_cols]
print("Biggest Swing Plays:")
print(swing_plays.head(20))

# Interceptions that moved win probability the most.
picks = pbp[(pbp["interception"] == 1) & (pbp["wpa"].notna())].copy()
picks["abs_wpa"] = picks["wpa"].abs()
ints = picks.nlargest(10, "abs_wpa")[
    ["game_id", "passer_player_name", "interception_player_name", "wpa", "wp"]
]
print("\nBiggest Interceptions by WPA:")
print(ints)
nflfastR
tidyverse
nfl_data_py
pandas
# Identify 4th-quarter comebacks: games the side trailing in win
# probability entering the 4th went on to win.
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Find games with 4th quarter comebacks
# first()/last() assume rows are in chronological play order within each
# game — TODO confirm if using re-sorted play-by-play data.
comebacks <- pbp %>%
filter(qtr == 4) %>%
group_by(game_id, home_team, away_team) %>%
summarize(
start_wp_home = first(home_wp),
end_wp_home = last(home_wp),
home_score = last(total_home_score),
away_score = last(total_away_score),
.groups = "drop"
) %>%
mutate(
home_won = home_score > away_score,
comeback = (start_wp_home < 0.25 & home_won) |
(start_wp_home > 0.75 & !home_won),
wp_swing = abs(end_wp_home - start_wp_home)
) %>%
filter(comeback) %>%
arrange(desc(wp_swing))
print(comebacks)
# Teams with most comebacks
# The winner is the comeback team: home if home_won, else away.
comeback_counts <- comebacks %>%
mutate(
comeback_team = if_else(home_won, home_team, away_team)
) %>%
count(comeback_team, sort = TRUE)
print(comeback_counts)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# 4th-quarter plays only: compare home WP entering vs leaving the quarter.
q4 = pbp[pbp["qtr"] == 4].copy()
game_wp = (
    q4.groupby(["game_id", "home_team", "away_team"])
    .agg(
        start_wp_home=("home_wp", "first"),
        end_wp_home=("home_wp", "last"),
        home_score=("total_home_score", "last"),
        away_score=("total_away_score", "last"),
    )
    .reset_index()
)

# A "comeback": the eventual winner was under 25% WP entering the 4th.
game_wp["home_won"] = game_wp["home_score"] > game_wp["away_score"]
low_home = game_wp["start_wp_home"] < 0.25
high_home = game_wp["start_wp_home"] > 0.75
game_wp["comeback"] = (low_home & game_wp["home_won"]) | (high_home & ~game_wp["home_won"])
game_wp["wp_swing"] = (game_wp["end_wp_home"] - game_wp["start_wp_home"]).abs()

comebacks = game_wp[game_wp["comeback"]].sort_values("wp_swing", ascending=False)
print("4th Quarter Comebacks:")
print(comebacks)
nflfastR
tidyverse
nfl_data_py
pandas
# Draw a single game's home win probability chart with ggplot2.
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Select a specific game (e.g., Super Bowl or exciting game)
game <- pbp %>%
filter(week == 1, home_team == "KC") %>%
filter(!is.na(wp))
# Create WP chart data
# game_time = seconds elapsed (0-3600). wp is from the possessing team's
# perspective, so flip it when the away team has the ball. Rows with no
# posteam yield NA home_wp — presumably dropped by geom_line; verify.
wp_chart <- game %>%
select(game_seconds_remaining, wp, posteam, home_team, away_team) %>%
mutate(
game_time = 3600 - game_seconds_remaining,
home_wp = if_else(posteam == home_team, wp, 1 - wp)
)
# Plot
# NOTE(review): `size` for lines is deprecated since ggplot2 3.4 in
# favor of `linewidth` — still works, but warns on newer versions.
ggplot(wp_chart, aes(x = game_time, y = home_wp)) +
geom_line(color = "#E31837", size = 1) +
geom_hline(yintercept = 0.5, linetype = "dashed", color = "gray50") +
scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
scale_x_continuous(
breaks = c(0, 900, 1800, 2700, 3600),
labels = c("Q1", "Q2", "Q3", "Q4", "End")
) +
labs(
title = paste(unique(game$away_team), "@", unique(game$home_team)),
x = "Game Time",
y = "Home Win Probability"
) +
theme_minimal()
# Plot a single game's home win probability over elapsed game time.
import nfl_data_py as nfl
import pandas as pd
import matplotlib.pyplot as plt
pbp = nfl.import_pbp_data([2023])
# Select a specific game
game = pbp[(pbp["week"] == 1) & (pbp["home_team"] == "KC")]
game = game[game["wp"].notna()].copy()
# Calculate game time and home WP
# wp is from the possessing team's perspective; flip it whenever the
# away team has the ball. Row-wise apply is fine at single-game scale.
game["game_time"] = 3600 - game["game_seconds_remaining"]
game["home_wp"] = game.apply(
lambda x: x["wp"] if x["posteam"] == x["home_team"] else 1 - x["wp"],
axis=1
)
# Create plot
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(game["game_time"], game["home_wp"], color="#E31837", linewidth=2)
ax.axhline(y=0.5, color="gray", linestyle="--", alpha=0.5)
ax.set_xlim(0, 3600)
ax.set_ylim(0, 1)
# Tick marks at the quarter boundaries (900s per quarter).
ax.set_xticks([0, 900, 1800, 2700, 3600])
ax.set_xticklabels(["Q1", "Q2", "Q3", "Q4", "End"])
ax.set_ylabel("Home Win Probability")
ax.set_title(f"{game['away_team'].iloc[0]} @ {game['home_team'].iloc[0]}")
plt.tight_layout()
plt.savefig("wp_chart.png", dpi=300)
plt.show()
nflfastR
tidyverse
ggplot2
nfl_data_py
pandas
matplotlib
# Compare EPA (context-free play value) with WPA (win-context value).
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Correlation between EPA and WPA
plays <- pbp %>%
filter(!is.na(epa), !is.na(wpa))
cat("Correlation EPA vs WPA:", cor(plays$epa, plays$wpa), "\n")
# EPA vs WPA by game situation
# case_when is order-sensitive: any first-half play is labeled
# "First Half" even if it also meets the blowout WP condition.
situation_comparison <- plays %>%
mutate(
situation = case_when(
qtr <= 2 ~ "First Half",
wp > 0.75 | wp < 0.25 ~ "Blowout",
TRUE ~ "Competitive"
)
) %>%
group_by(situation) %>%
summarize(
plays = n(),
avg_epa = mean(epa),
avg_wpa = mean(wpa),
epa_wpa_cor = cor(epa, wpa)
)
print(situation_comparison)
# High EPA but low WPA (garbage time) vs low EPA high WPA (clutch)
# Quartile cross-tab; this bare pipeline auto-prints only interactively.
plays %>%
mutate(
epa_quartile = ntile(epa, 4),
wpa_quartile = ntile(wpa, 4)
) %>%
count(epa_quartile, wpa_quartile) %>%
pivot_wider(names_from = wpa_quartile, values_from = n)
import nfl_data_py as nfl
import pandas as pd
import numpy as np

# Compare EPA (context-free play value) with WPA (win-context value).
pbp = nfl.import_pbp_data([2023])

# Keep plays where both metrics are defined. .copy() so the situation
# column assigned below targets a real frame, not a view of pbp
# (otherwise pandas raises SettingWithCopyWarning and, under
# copy-on-write, the assignment would be silently lost).
plays = pbp[(pbp["epa"].notna()) & (pbp["wpa"].notna())].copy()

# Overall correlation
correlation = plays["epa"].corr(plays["wpa"])
print(f"EPA vs WPA Correlation: {correlation:.3f}")

# Classify game situation. np.select evaluates conditions in order, so
# first-half plays win over the blowout check — same precedence as the
# row-wise version, but vectorized.
conditions = [
    plays["qtr"] <= 2,
    (plays["wp"] > 0.75) | (plays["wp"] < 0.25),
]
plays["situation"] = np.select(conditions, ["First Half", "Blowout"],
                               default="Competitive")
situation_comparison = (plays.groupby("situation")
                        .agg(
                            plays=("epa", "count"),
                            avg_epa=("epa", "mean"),
                            avg_wpa=("wpa", "mean")
                        )
                        .reset_index())
print("\nEPA vs WPA by Situation:")
print(situation_comparison)
nflfastR
tidyverse
nfl_data_py
pandas
numpy
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Fourth down decisions in close 4th quarter games.
# nflfastR play-by-play has no `fourth_down_decision` column, so the
# original filter dropped every row; derive the coach's choice from
# `play_type` instead (a pass or run snap on 4th down = "go").
late_4th_downs <- pbp %>%
  filter(
    down == 4,
    qtr == 4,
    wp >= 0.20 & wp <= 0.80,
    play_type %in% c("pass", "run", "punt", "field_goal")
  ) %>%
  mutate(
    fourth_down_decision = if_else(
      play_type %in% c("pass", "run"), "go", play_type
    )
  )

# Decision distribution
late_4th_downs %>%
  count(fourth_down_decision)

# WP impact by decision: average WPA, conversion rate (go attempts),
# and the average pre-snap win probability.
decision_impact <- late_4th_downs %>%
  group_by(fourth_down_decision) %>%
  summarize(
    attempts = n(),
    avg_wpa = mean(wpa, na.rm = TRUE),
    success_rate = mean(fourth_down_converted, na.rm = TRUE),
    avg_wp_before = mean(wp)
  )
print(decision_impact)

# Aggressive vs conservative teams, ranked by go-for-it rate.
coach_decisions <- late_4th_downs %>%
  group_by(posteam) %>%
  summarize(
    fourth_downs = n(),
    go_for_it_rate = mean(fourth_down_decision == "go"),
    punt_rate = mean(fourth_down_decision == "punt"),
    fg_rate = mean(fourth_down_decision == "field_goal")
  ) %>%
  arrange(desc(go_for_it_rate))
print(coach_decisions)
import nfl_data_py as nfl
import pandas as pd
import numpy as np

pbp = nfl.import_pbp_data([2023])

# Fourth down decisions in close 4th quarter games.
# The pbp data has no `fourth_down_decision` column, so the original
# .notna() filter matched nothing; derive the choice from `play_type`
# (a pass or run snap on 4th down = "go"). .copy() so the derived
# column is assigned on a real frame, not a view.
late_4th = pbp[
    (pbp["down"] == 4) &
    (pbp["qtr"] == 4) &
    (pbp["wp"] >= 0.20) &
    (pbp["wp"] <= 0.80) &
    (pbp["play_type"].isin(["pass", "run", "punt", "field_goal"]))
].copy()
late_4th["fourth_down_decision"] = np.where(
    late_4th["play_type"].isin(["pass", "run"]), "go", late_4th["play_type"]
)

# Decision distribution
print("4th Down Decision Distribution:")
print(late_4th["fourth_down_decision"].value_counts())

# WP impact by decision
decision_impact = (late_4th.groupby("fourth_down_decision")
                   .agg(
                       attempts=("wpa", "count"),
                       avg_wpa=("wpa", "mean"),
                       avg_wp_before=("wp", "mean")
                   )
                   .reset_index())
print("\nDecision Impact:")
print(decision_impact)

# Team aggressiveness: share of these 4th downs where the offense went.
team_decisions = (late_4th.groupby("posteam")
                  .agg(fourth_downs=("down", "count"))
                  .reset_index())
go_counts = late_4th[late_4th["fourth_down_decision"] == "go"].groupby("posteam").size()
team_decisions = team_decisions.merge(
    go_counts.reset_index(name="go_for_it"),
    on="posteam", how="left"
).fillna(0)
team_decisions["go_rate"] = team_decisions["go_for_it"] / team_decisions["fourth_downs"]
print("\nTeam Aggressiveness:")
print(team_decisions.sort_values("go_rate", ascending=False))
nflfastR
tidyverse
nfl_data_py
pandas
Create publication-quality charts and visualizations for NFL data
library(nflfastR)
library(nflplotR)
library(tidyverse)

pbp <- load_pbp(2023)

# Filter once and reuse for both sides of the ball (the original ran
# the identical filter twice).
plays <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run"))

# Offensive EPA/play joined with the EPA/play each team's defense allows.
team_epa <- plays %>%
  group_by(posteam) %>%
  summarize(off_epa = mean(epa)) %>%
  left_join(
    plays %>%
      group_by(defteam) %>%
      summarize(def_epa = mean(epa)),
    by = c("posteam" = "defteam")
  )

# Logo scatter. The y-axis is reversed so better defenses (lower EPA
# allowed) plot toward the top — matching the axis label and the
# matplotlib version of this chart, which inverts its y-axis.
ggplot(team_epa, aes(x = off_epa, y = def_epa)) +
  geom_mean_lines(aes(y0 = def_epa, x0 = off_epa)) +
  geom_nfl_logos(aes(team_abbr = posteam), width = 0.065) +
  scale_y_reverse() +
  labs(
    title = "NFL Team Efficiency (2023)",
    subtitle = "Offensive vs Defensive EPA/Play",
    x = "Offensive EPA/Play (higher is better)",
    y = "Defensive EPA/Play (lower is better)",
    caption = "Data: nflfastR"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    axis.title = element_text(size = 12)
  )
ggsave("epa_scatter.png", width = 10, height = 8, dpi = 300)
# Team efficiency scatter: offensive vs defensive EPA/play, matplotlib.
import nfl_data_py as nfl
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
pbp = nfl.import_pbp_data([2023])
# Calculate team EPA
plays = pbp[(pbp["epa"].notna()) &
(pbp["play_type"].isin(["pass", "run"]))]
# Offense: EPA gained per play; defense: EPA allowed per play.
off_epa = plays.groupby("posteam")["epa"].mean().reset_index()
off_epa.columns = ["team", "off_epa"]
def_epa = plays.groupby("defteam")["epa"].mean().reset_index()
def_epa.columns = ["team", "def_epa"]
team_epa = off_epa.merge(def_epa, on="team")
# Create scatter plot
fig, ax = plt.subplots(figsize=(12, 10))
ax.scatter(team_epa["off_epa"], team_epa["def_epa"], s=100, alpha=0.7)
# Add team labels
for idx, row in team_epa.iterrows():
    ax.annotate(row["team"], (row["off_epa"], row["def_epa"]),
                fontsize=9, ha="center", va="bottom")
# Add mean lines
ax.axhline(y=team_epa["def_epa"].mean(), color="gray",
linestyle="--", alpha=0.5)
ax.axvline(x=team_epa["off_epa"].mean(), color="gray",
linestyle="--", alpha=0.5)
ax.set_xlabel("Offensive EPA/Play (higher is better)", fontsize=12)
ax.set_ylabel("Defensive EPA/Play (lower is better)", fontsize=12)
ax.set_title("NFL Team Efficiency (2023)", fontsize=16, fontweight="bold")
ax.invert_yaxis() # Lower def EPA is better
plt.tight_layout()
plt.savefig("epa_scatter.png", dpi=300)
plt.show()
nflfastR
nflplotR
ggplot2
tidyverse
matplotlib
pandas
# Week-by-week offensive EPA/play line chart for selected teams.
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Get weekly EPA for specific teams
teams_of_interest <- c("KC", "SF", "BAL", "DET")
weekly_epa <- pbp %>%
filter(!is.na(epa), posteam %in% teams_of_interest) %>%
group_by(posteam, week) %>%
summarize(epa_per_play = mean(epa), .groups = "drop")
# Create line chart
# Colors are each team's primary brand color.
# NOTE(review): `size` for lines is deprecated since ggplot2 3.4
# (use `linewidth`) — still works with a warning.
ggplot(weekly_epa, aes(x = week, y = epa_per_play,
color = posteam, group = posteam)) +
geom_line(size = 1.2) +
geom_point(size = 2) +
geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
scale_color_manual(values = c("KC" = "#E31837", "SF" = "#AA0000",
"BAL" = "#241773", "DET" = "#0076B6")) +
labs(
title = "Weekly EPA/Play Trends (2023)",
x = "Week",
y = "EPA/Play",
color = "Team"
) +
theme_minimal() +
theme(legend.position = "bottom")
ggsave("weekly_epa.png", width = 10, height = 6)
# Week-by-week offensive EPA/play line chart for selected teams.
import nfl_data_py as nfl
import pandas as pd
import matplotlib.pyplot as plt
pbp = nfl.import_pbp_data([2023])
# Get weekly EPA for specific teams
teams = ["KC", "SF", "BAL", "DET"]
# Primary brand color per team, keyed by abbreviation.
team_colors = {"KC": "#E31837", "SF": "#AA0000",
"BAL": "#241773", "DET": "#0076B6"}
plays = pbp[(pbp["epa"].notna()) & (pbp["posteam"].isin(teams))]
weekly_epa = plays.groupby(["posteam", "week"])["epa"].mean().reset_index()
# Create line chart: one colored series per team.
fig, ax = plt.subplots(figsize=(12, 6))
for team in teams:
    team_data = weekly_epa[weekly_epa["posteam"] == team]
    ax.plot(team_data["week"], team_data["epa"],
            color=team_colors[team], label=team,
            linewidth=2, marker="o", markersize=5)
ax.axhline(y=0, color="gray", linestyle="--", alpha=0.5)
ax.set_xlabel("Week", fontsize=12)
ax.set_ylabel("EPA/Play", fontsize=12)
ax.set_title("Weekly EPA/Play Trends (2023)", fontsize=14, fontweight="bold")
ax.legend(loc="lower right")
plt.tight_layout()
plt.savefig("weekly_epa.png", dpi=300)
plt.show()
nflfastR
ggplot2
tidyverse
matplotlib
pandas
# Pass chart for one QB: throw location (left/middle/right) vs air yards,
# colored by outcome.
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Get pass data for a specific QB
qb_passes <- pbp %>%
filter(
passer_player_name == "P.Mahomes",
!is.na(air_yards),
!is.na(pass_location)
) %>%
mutate(
# pass_location only gives a coarse left/middle/right bucket, so
# throws are jittered onto three fixed x positions.
x = case_when(
pass_location == "left" ~ -10,
pass_location == "middle" ~ 0,
pass_location == "right" ~ 10
),
y = air_yards,
# Order matters: an intercepted pass is labeled INT, not Incomplete.
result = case_when(
interception == 1 ~ "INT",
complete_pass == 1 ~ "Complete",
TRUE ~ "Incomplete"
)
)
# Create pass chart
# The thick green line at y = 0 represents the line of scrimmage.
ggplot(qb_passes, aes(x = x, y = y, color = result)) +
geom_point(alpha = 0.6, size = 2) +
geom_hline(yintercept = 0, color = "darkgreen", size = 2) +
scale_color_manual(values = c("Complete" = "#00AA00",
"Incomplete" = "#AA0000",
"INT" = "#000000")) +
coord_cartesian(xlim = c(-20, 20), ylim = c(-5, 40)) +
labs(
title = "Patrick Mahomes Pass Chart (2023)",
x = "Field Width",
y = "Air Yards",
color = "Result"
) +
theme_minimal() +
theme(
panel.grid.minor = element_blank(),
legend.position = "right"
)
ggsave("pass_chart.png", width = 8, height = 10)
# Pass chart for one QB: throw location vs air yards, colored by outcome.
import nfl_data_py as nfl
import pandas as pd
import matplotlib.pyplot as plt
pbp = nfl.import_pbp_data([2023])
# Get pass data for a specific QB
qb_passes = pbp[
(pbp["passer_player_name"] == "P.Mahomes") &
(pbp["air_yards"].notna()) &
(pbp["pass_location"].notna())
].copy()
# Map pass location to x coordinate
# pass_location is a coarse left/middle/right bucket; use fixed x slots.
location_map = {"left": -10, "middle": 0, "right": 10}
qb_passes["x"] = qb_passes["pass_location"].map(location_map)
qb_passes["y"] = qb_passes["air_yards"]
# Determine result
# Order matters: an intercepted pass is labeled INT, not Incomplete.
def get_result(row):
    if row["interception"] == 1: return "INT"
    elif row["complete_pass"] == 1: return "Complete"
    else: return "Incomplete"
qb_passes["result"] = qb_passes.apply(get_result, axis=1)
# Create pass chart: one scatter layer per outcome so the legend
# carries the three colors.
fig, ax = plt.subplots(figsize=(8, 10))
colors = {"Complete": "#00AA00", "Incomplete": "#AA0000", "INT": "#000000"}
for result, color in colors.items():
    data = qb_passes[qb_passes["result"] == result]
    ax.scatter(data["x"], data["y"], c=color, label=result,
               alpha=0.6, s=30)
# Thick green line at y = 0 marks the line of scrimmage.
ax.axhline(y=0, color="darkgreen", linewidth=3)
ax.set_xlim(-20, 20)
ax.set_ylim(-5, 40)
ax.set_xlabel("Field Width")
ax.set_ylabel("Air Yards")
ax.set_title("Patrick Mahomes Pass Chart (2023)", fontweight="bold")
ax.legend()
plt.tight_layout()
plt.savefig("pass_chart.png", dpi=300)
plt.show()
nflfastR
ggplot2
tidyverse
matplotlib
pandas
# Heatmap of mean EPA for every down x distance-bucket cell.
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Create EPA heatmap by down and distance bucket
# case_when falls through in order, so buckets are mutually exclusive.
heatmap_data <- pbp %>%
filter(!is.na(epa), down <= 4, !is.na(ydstogo)) %>%
mutate(
distance_bucket = case_when(
ydstogo <= 3 ~ "1-3",
ydstogo <= 6 ~ "4-6",
ydstogo <= 10 ~ "7-10",
TRUE ~ "10+"
)
) %>%
group_by(down, distance_bucket) %>%
summarize(
avg_epa = mean(epa),
plays = n(),
.groups = "drop"
)
# Create heatmap
# Diverging fill centered on EPA = 0 (red = bad, green = good).
ggplot(heatmap_data, aes(x = distance_bucket, y = factor(down), fill = avg_epa)) +
geom_tile(color = "white", size = 0.5) +
geom_text(aes(label = round(avg_epa, 3)), color = "white", size = 4) +
scale_fill_gradient2(low = "#BB0000", mid = "gray90", high = "#006400",
midpoint = 0, name = "EPA/Play") +
labs(
title = "EPA by Down and Distance (2023)",
x = "Yards to Go",
y = "Down"
) +
theme_minimal() +
theme(panel.grid = element_blank())
ggsave("epa_heatmap.png", width = 10, height = 6)
import nfl_data_py as nfl
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Heatmap of mean EPA for every down x distance-bucket cell.
pbp = nfl.import_pbp_data([2023])

# Filter valid plays. .copy() so the bucket column assigned below
# targets a real frame, not a view of pbp (SettingWithCopyWarning /
# silently-lost assignment under copy-on-write).
plays = pbp[(pbp["epa"].notna()) & (pbp["down"] <= 4) & (pbp["ydstogo"].notna())].copy()

def distance_bucket(yd):
    """Map a yards-to-go value onto the chart's x-axis bucket."""
    if yd <= 3:
        return "1-3"
    elif yd <= 6:
        return "4-6"
    elif yd <= 10:
        return "7-10"
    return "10+"

plays["distance_bucket"] = plays["ydstogo"].apply(distance_bucket)

# Mean EPA by down/distance, pivoted into heatmap form.
heatmap_data = plays.groupby(["down", "distance_bucket"])["epa"].mean().reset_index()
pivot_data = heatmap_data.pivot(index="down", columns="distance_bucket", values="epa")

# Reorder columns by increasing distance (pivot sorts alphabetically).
col_order = ["1-3", "4-6", "7-10", "10+"]
pivot_data = pivot_data[[c for c in col_order if c in pivot_data.columns]]

# Render with a diverging palette centered on EPA = 0.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(pivot_data, annot=True, fmt=".3f", cmap="RdYlGn", center=0,
            linewidths=0.5, ax=ax, cbar_kws={"label": "EPA/Play"})
ax.set_xlabel("Yards to Go")
ax.set_ylabel("Down")
ax.set_title("EPA by Down and Distance (2023)", fontweight="bold")
plt.tight_layout()
plt.savefig("epa_heatmap.png", dpi=300)
plt.show()
nflfastR
ggplot2
tidyverse
seaborn
matplotlib
pandas
# Build normalized QB metric profiles suitable for a radar chart.
library(nflfastR)
library(tidyverse)
library(fmsb) # For radar charts
pbp <- load_pbp(2023)
# Calculate QB metrics
qb_metrics <- pbp %>%
filter(!is.na(passer_player_id), play_type == "pass") %>%
group_by(passer_player_name) %>%
summarize(
attempts = n(),
epa_play = mean(epa, na.rm = TRUE),
cpoe = mean(cpoe, na.rm = TRUE),
avg_air = mean(air_yards, na.rm = TRUE),
deep_pct = mean(air_yards >= 20, na.rm = TRUE) * 100,
sack_rate = mean(sack == 1, na.rm = TRUE) * 100,
.groups = "drop"
) %>%
filter(attempts >= 300)
# Normalize to 0-100 scale for radar
# Min-max scaling is computed across qualifying QBs only (>= 300 att).
normalize <- function(x) (x - min(x)) / (max(x) - min(x)) * 100
qb_radar <- qb_metrics %>%
mutate(across(c(epa_play, cpoe, avg_air, deep_pct), normalize)) %>%
mutate(sack_rate = 100 - normalize(sack_rate)) # Invert - lower is better
# Get top QBs for comparison
top_qbs <- c("P.Mahomes", "J.Allen", "L.Jackson")
radar_data <- qb_radar %>%
filter(passer_player_name %in% top_qbs) %>%
select(passer_player_name, epa_play, cpoe, avg_air, deep_pct, sack_rate)
print(radar_data)
import nfl_data_py as nfl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import pi

# Radar-chart comparison of normalized QB metrics.
pbp = nfl.import_pbp_data([2023])

# Per-QB season metrics over designed pass plays.
passes = pbp[(pbp["passer_player_id"].notna()) & (pbp["play_type"] == "pass")]
qb_metrics = (passes.groupby("passer_player_name")
              .agg(
                  attempts=("play_id", "count"),
                  epa_play=("epa", "mean"),
                  cpoe=("cpoe", "mean"),
                  avg_air=("air_yards", "mean"),
                  deep_pct=("air_yards", lambda x: (x >= 20).mean() * 100),
                  sack_rate=("sack", "mean")
              )
              .reset_index())
# .copy() so the *_norm columns assigned below target a real frame,
# not a view of the unfiltered metrics (SettingWithCopyWarning).
qb_metrics = qb_metrics[qb_metrics["attempts"] >= 300].copy()

def normalize(col):
    """Min-max scale a Series to 0-100 across qualifying QBs."""
    return (col - col.min()) / (col.max() - col.min()) * 100

for col in ["epa_play", "cpoe", "avg_air", "deep_pct"]:
    qb_metrics[col + "_norm"] = normalize(qb_metrics[col])
# Invert sack rate so 100 = best (fewest sacks taken).
qb_metrics["sack_norm"] = 100 - normalize(qb_metrics["sack_rate"] * 100)

# Create radar chart for top QBs.
categories = ["EPA/Play", "CPOE", "Air Yards", "Deep%", "Sack Avoidance"]
qbs = ["P.Mahomes", "J.Allen", "L.Jackson"]
fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
# One spoke per category; repeat the first angle to close the polygon.
angles = [n / float(len(categories)) * 2 * pi for n in range(len(categories))]
angles += angles[:1]
for qb in qbs:
    data = qb_metrics[qb_metrics["passer_player_name"] == qb]
    if not data.empty:
        values = [data["epa_play_norm"].iloc[0], data["cpoe_norm"].iloc[0],
                  data["avg_air_norm"].iloc[0], data["deep_pct_norm"].iloc[0],
                  data["sack_norm"].iloc[0]]
        values += values[:1]
        ax.plot(angles, values, linewidth=2, label=qb)
        ax.fill(angles, values, alpha=0.1)
ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories)
ax.set_title("QB Comparison Radar Chart", fontweight="bold")
ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.0))
plt.tight_layout()
plt.savefig("qb_radar.png", dpi=300)
plt.show()
nflfastR
tidyverse
fmsb
matplotlib
pandas
numpy
# EPA distribution box plots: by play type, then by down.
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# EPA distribution by play type
play_epa <- pbp %>%
filter(!is.na(epa), play_type %in% c("pass", "run"))
# Create box plot
# outlier.shape = NA hides outlier dots; coord_cartesian zooms without
# dropping the data the quartiles are computed from.
ggplot(play_epa, aes(x = play_type, y = epa, fill = play_type)) +
geom_boxplot(alpha = 0.7, outlier.shape = NA) +
coord_cartesian(ylim = c(-3, 3)) +
scale_fill_manual(values = c("pass" = "#1E90FF", "run" = "#228B22")) +
labs(
title = "EPA Distribution by Play Type (2023)",
x = "Play Type",
y = "EPA",
fill = "Type"
) +
theme_minimal() +
theme(legend.position = "none")
ggsave("epa_boxplot.png", width = 8, height = 6)
# Box plot by down
ggplot(play_epa %>% filter(!is.na(down), down <= 4),
aes(x = factor(down), y = epa, fill = factor(down))) +
geom_boxplot(alpha = 0.7, outlier.shape = NA) +
coord_cartesian(ylim = c(-2, 2)) +
labs(title = "EPA by Down", x = "Down", y = "EPA") +
theme_minimal() +
theme(legend.position = "none")
ggsave("epa_by_down.png", width = 8, height = 6)
# EPA distribution box plots: by play type and by down, side by side.
import nfl_data_py as nfl
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pbp = nfl.import_pbp_data([2023])
# Filter plays with EPA
plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"].isin(["pass", "run"]))]
# Create box plot by play type
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
# Plot 1: By play type
# showfliers=False hides outlier dots so the boxes stay readable.
# NOTE(review): newer seaborn deprecates `palette` without `hue` —
# still works here, but emits a FutureWarning; verify installed version.
sns.boxplot(data=plays, x="play_type", y="epa", ax=axes[0],
palette={"pass": "#1E90FF", "run": "#228B22"},
showfliers=False)
axes[0].set_ylim(-3, 3)
axes[0].set_xlabel("Play Type")
axes[0].set_ylabel("EPA")
axes[0].set_title("EPA Distribution by Play Type (2023)")
# Plot 2: By down
plays_by_down = plays[(plays["down"].notna()) & (plays["down"] <= 4)]
sns.boxplot(data=plays_by_down, x="down", y="epa", ax=axes[1],
palette="viridis", showfliers=False)
axes[1].set_ylim(-2, 2)
axes[1].set_xlabel("Down")
axes[1].set_ylabel("EPA")
axes[1].set_title("EPA Distribution by Down (2023)")
plt.tight_layout()
plt.savefig("epa_boxplots.png", dpi=300)
plt.show()
nflfastR
ggplot2
tidyverse
seaborn
matplotlib
pandas
# Interactive team-efficiency scatter built with plotly, saved to HTML.
library(nflfastR)
library(tidyverse)
library(plotly)
pbp <- load_pbp(2023)
# Calculate team stats
team_stats <- pbp %>%
filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
group_by(posteam) %>%
summarize(
off_epa = mean(epa),
pass_rate = mean(play_type == "pass") * 100,
plays = n(),
.groups = "drop"
) %>%
left_join(
pbp %>%
filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
group_by(defteam) %>%
summarize(def_epa = mean(epa)),
by = c("posteam" = "defteam")
)
# Create interactive scatter plot
# Custom hover text (HTML <br> line breaks) replaces the default tooltip.
p <- plot_ly(team_stats,
x = ~off_epa,
y = ~def_epa,
text = ~paste("Team:", posteam,
"<br>Off EPA:", round(off_epa, 3),
"<br>Def EPA:", round(def_epa, 3),
"<br>Pass Rate:", round(pass_rate, 1), "%"),
hoverinfo = "text",
type = "scatter",
mode = "markers",
marker = list(size = 12, opacity = 0.8)
) %>%
layout(
title = "NFL Team Efficiency (2023)",
xaxis = list(title = "Offensive EPA/Play"),
# Reversed so better defenses (lower EPA allowed) sit at the top.
yaxis = list(title = "Defensive EPA/Play", autorange = "reversed")
)
# Save or display
htmlwidgets::saveWidget(p, "team_efficiency.html")
# Interactive team-efficiency scatter with Plotly, saved to HTML.
import nfl_data_py as nfl
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
pbp = nfl.import_pbp_data([2023])
# Calculate team stats
plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"].isin(["pass", "run"]))]
off_stats = (plays.groupby("posteam")
.agg(
off_epa=("epa", "mean"),
pass_rate=("play_type", lambda x: (x == "pass").mean() * 100),
plays=("epa", "count")
)
.reset_index())
def_stats = (plays.groupby("defteam")
.agg(def_epa=("epa", "mean"))
.reset_index())
# Merge on team abbreviation; the redundant "defteam" column survives
# the merge but is harmless to the plot.
team_stats = off_stats.merge(def_stats, left_on="posteam", right_on="defteam")
# Create interactive scatter with Plotly
# hover_data format strings control tooltip rounding.
fig = px.scatter(
team_stats,
x="off_epa",
y="def_epa",
text="posteam",
hover_data={"off_epa": ":.3f", "def_epa": ":.3f", "pass_rate": ":.1f"},
title="NFL Team Efficiency (2023)"
)
fig.update_traces(textposition="top center", marker=dict(size=12))
fig.update_yaxes(autorange="reversed") # Lower def EPA is better
fig.update_layout(
xaxis_title="Offensive EPA/Play (higher is better)",
yaxis_title="Defensive EPA/Play (lower is better)"
)
fig.write_html("team_efficiency.html")
fig.show()
nflfastR
tidyverse
plotly
pandas
# Histograms: EPA distribution by play type, then air-yards distribution.
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# EPA histogram
epa_data <- pbp %>%
filter(!is.na(epa), play_type %in% c("pass", "run"))
# position = "identity" overlays the two distributions instead of
# stacking; alpha keeps both visible.
ggplot(epa_data, aes(x = epa, fill = play_type)) +
geom_histogram(bins = 50, alpha = 0.6, position = "identity") +
scale_fill_manual(values = c("pass" = "#1E90FF", "run" = "#228B22")) +
geom_vline(xintercept = 0, linetype = "dashed", color = "black") +
coord_cartesian(xlim = c(-5, 5)) +
labs(
title = "Distribution of EPA by Play Type (2023)",
x = "Expected Points Added (EPA)",
y = "Count",
fill = "Play Type"
) +
theme_minimal()
ggsave("epa_histogram.png", width = 10, height = 6)
# Air yards histogram
air_data <- pbp %>%
filter(!is.na(air_yards), play_type == "pass")
ggplot(air_data, aes(x = air_yards)) +
geom_histogram(bins = 40, fill = "#1E90FF", alpha = 0.7) +
geom_vline(aes(xintercept = mean(air_yards)), color = "red",
linetype = "dashed", size = 1) +
labs(
title = "Distribution of Air Yards (2023)",
x = "Air Yards",
y = "Count",
caption = "Red line = mean"
) +
theme_minimal()
ggsave("air_yards_histogram.png", width = 10, height = 6)
# Histograms: EPA by play type and air-yards distribution, side by side.
import nfl_data_py as nfl
import pandas as pd
import matplotlib.pyplot as plt
pbp = nfl.import_pbp_data([2023])
# EPA histogram by play type
plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"].isin(["pass", "run"]))]
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Plot 1: EPA distribution (overlaid, semi-transparent)
for play_type, color in [("pass", "#1E90FF"), ("run", "#228B22")]:
    data = plays[plays["play_type"] == play_type]["epa"]
    axes[0].hist(data, bins=50, alpha=0.6, color=color, label=play_type.capitalize())
axes[0].axvline(x=0, color="black", linestyle="--")
axes[0].set_xlim(-5, 5)
axes[0].set_xlabel("EPA")
axes[0].set_ylabel("Count")
axes[0].set_title("Distribution of EPA by Play Type (2023)")
axes[0].legend()
# Plot 2: Air yards distribution with the mean marked in red.
air_data = pbp[(pbp["air_yards"].notna()) & (pbp["play_type"] == "pass")]
axes[1].hist(air_data["air_yards"], bins=40, color="#1E90FF", alpha=0.7)
mean_air = air_data["air_yards"].mean()
axes[1].axvline(x=mean_air, color="red", linestyle="--", label=f"Mean: {mean_air:.1f}")
axes[1].set_xlabel("Air Yards")
axes[1].set_ylabel("Count")
axes[1].set_title("Distribution of Air Yards (2023)")
axes[1].legend()
plt.tight_layout()
plt.savefig("histograms.png", dpi=300)
plt.show()
nflfastR
ggplot2
tidyverse
matplotlib
pandas
# Correlation matrix of team offensive metrics, rendered with corrplot.
library(nflfastR)
library(tidyverse)
library(corrplot)
pbp <- load_pbp(2023)
# Calculate team metrics for correlation
team_metrics <- pbp %>%
filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
group_by(posteam) %>%
summarize(
off_epa = mean(epa),
pass_rate = mean(play_type == "pass") * 100,
pass_epa = mean(epa[play_type == "pass"]),
run_epa = mean(epa[play_type == "run"]),
success_rate = mean(success, na.rm = TRUE) * 100,
explosive_rate = mean(epa > 2, na.rm = TRUE) * 100,
.groups = "drop"
) 
# Create correlation matrix
# Drop the team id column so cor() sees only numeric metrics.
cor_matrix <- team_metrics %>%
select(-posteam) %>%
cor(use = "complete.obs")
# Visualize
# Rendered twice: once to the active device for viewing, then again
# into a PNG device for the saved file.
corrplot(cor_matrix,
method = "color",
type = "upper",
addCoef.col = "black",
number.cex = 0.7,
tl.col = "black",
title = "Team Offensive Metrics Correlation",
mar = c(0, 0, 2, 0))
png("correlation_matrix.png", width = 800, height = 800)
corrplot(cor_matrix, method = "color", type = "upper",
addCoef.col = "black", number.cex = 0.8)
dev.off()
# Correlation heatmap of team offensive metrics (upper triangle masked).
import nfl_data_py as nfl
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
pbp = nfl.import_pbp_data([2023])
# Calculate team metrics
plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"].isin(["pass", "run"]))]
def calculate_metrics(group):
    # One row of summary metrics per team.
    return pd.Series({
        "off_epa": group["epa"].mean(),
        "pass_rate": (group["play_type"] == "pass").mean() * 100,
        "pass_epa": group[group["play_type"] == "pass"]["epa"].mean(),
        "run_epa": group[group["play_type"] == "run"]["epa"].mean(),
        "success_rate": group["success"].mean() * 100,
        "explosive_rate": (group["epa"] > 2).mean() * 100
    })
# NOTE(review): groupby.apply on the full frame emits a FutureWarning on
# recent pandas (include_groups); behavior is unchanged here — verify
# against the installed pandas version.
team_metrics = plays.groupby("posteam").apply(calculate_metrics).reset_index()
# Calculate correlation matrix
metrics_only = team_metrics.drop("posteam", axis=1)
cor_matrix = metrics_only.corr()
# Create heatmap
# Mask the upper triangle so each pair is shown once.
fig, ax = plt.subplots(figsize=(10, 8))
mask = np.triu(np.ones_like(cor_matrix, dtype=bool))
sns.heatmap(cor_matrix, mask=mask, annot=True, fmt=".2f",
cmap="coolwarm", center=0, square=True,
linewidths=0.5, ax=ax)
ax.set_title("Team Offensive Metrics Correlation", fontweight="bold")
plt.tight_layout()
plt.savefig("correlation_matrix.png", dpi=300)
plt.show()
nflfastR
tidyverse
corrplot
seaborn
matplotlib
pandas
# Rolling 50-play EPA trend line for one team, with week annotations.
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Calculate rolling EPA for a team
# zoo::rollmean needs the zoo package installed; align = "right" makes
# each value a trailing average, with NA for the first 49 plays.
team_rolling <- pbp %>%
filter(posteam == "SF", !is.na(epa), play_type %in% c("pass", "run")) %>%
arrange(game_id, play_id) %>%
mutate(
play_num = row_number(),
rolling_epa = zoo::rollmean(epa, k = 50, fill = NA, align = "right")
)
# Define key game events
# One marker per selected week, placed at that week's final play.
key_games <- team_rolling %>%
group_by(game_id, week) %>%
summarize(
play_num = max(play_num),
rolling_epa = last(na.omit(rolling_epa)),
.groups = "drop"
) %>%
filter(week %in% c(1, 8, 18)) # Annotate select weeks
# Create time series with annotations
# NOTE(review): `size` for lines is deprecated since ggplot2 3.4
# (use `linewidth`) — still works with a warning.
ggplot(team_rolling, aes(x = play_num, y = rolling_epa)) +
geom_line(color = "#AA0000", size = 1) +
geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
geom_point(data = key_games, color = "black", size = 3) +
geom_text(data = key_games, aes(label = paste("Week", week)),
vjust = -1.5, size = 3) +
labs(
title = "San Francisco 49ers - Rolling EPA/Play (2023)",
subtitle = "50-play rolling average",
x = "Play Number",
y = "Rolling EPA/Play"
) +
theme_minimal()
ggsave("rolling_epa.png", width = 12, height = 6)
import nfl_data_py as nfl
import pandas as pd
import matplotlib.pyplot as plt

pbp = nfl.import_pbp_data([2023])
# Rolling EPA for SF offensive snaps (pass + run, EPA present)
sf_plays = pbp[(pbp["posteam"] == "SF") &
               (pbp["epa"].notna()) &
               (pbp["play_type"].isin(["pass", "run"]))].copy()
sf_plays = sf_plays.sort_values(["game_id", "play_id"])
sf_plays["play_num"] = range(1, len(sf_plays) + 1)
sf_plays["rolling_epa"] = sf_plays["epa"].rolling(window=50).mean()
# One annotation per selected week: last play of that week
key_weeks = [1, 8, 18]
annotations = []
for week in key_weeks:
    week_data = sf_plays[sf_plays["week"] == week]
    if not week_data.empty:
        rolling = week_data["rolling_epa"].dropna()
        annotations.append({
            "week": week,
            "play_num": week_data["play_num"].max(),
            # Fall back to 0 while the 50-play window hasn't filled yet
            "rolling_epa": rolling.iloc[-1] if not rolling.empty else 0
        })
# Create plot
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(sf_plays["play_num"], sf_plays["rolling_epa"],
        color="#AA0000", linewidth=1.5, label="Rolling EPA")
ax.axhline(y=0, color="gray", linestyle="--", alpha=0.7)
# Add annotations. Note: the original used f"... {ann[\"week\"]}", which is a
# SyntaxError before Python 3.12 (no backslashes in f-string expressions);
# single quotes inside the expression are portable.
for ann in annotations:
    ax.scatter(ann["play_num"], ann["rolling_epa"], color="black", s=50, zorder=5)
    ax.annotate(f"Week {ann['week']}", (ann["play_num"], ann["rolling_epa"]),
                textcoords="offset points", xytext=(0, 10), ha="center")
ax.set_xlabel("Play Number")
ax.set_ylabel("Rolling EPA/Play (50-play average)")
ax.set_title("San Francisco 49ers - Rolling EPA/Play (2023)", fontweight="bold")
plt.tight_layout()
plt.savefig("rolling_epa.png", dpi=300)
plt.show()
nflfastR
tidyverse
zoo
matplotlib
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Collapse play-by-play to one row per drive: start spot and final outcome
per_drive <- pbp %>%
  filter(!is.na(drive), !is.na(yardline_100)) %>%
  group_by(game_id, drive) %>%
  summarize(
    start_yard = first(yardline_100),
    result = last(fixed_drive_result),
    posteam = first(posteam),
    .groups = "drop"
  ) %>%
  filter(!is.na(result))
# Bucket by starting spot (yardline_100 = yards from the opponent end zone)
# and compute the touchdown rate per bucket
field_summary <- per_drive %>%
  mutate(
    zone = case_when(
      start_yard >= 80 ~ "Own 0-20",
      start_yard >= 60 ~ "Own 20-40",
      start_yard >= 40 ~ "Midfield",
      start_yard >= 20 ~ "Opp 20-40",
      TRUE ~ "Red Zone"
    ),
    scored = result == "Touchdown"
  ) %>%
  group_by(zone) %>%
  summarize(
    drives = n(),
    tds = sum(scored),
    td_rate = mean(scored) * 100,
    .groups = "drop"
  )
# Bar chart ordered best-to-worst TD rate, labels above each bar
ggplot(field_summary, aes(x = reorder(zone, -td_rate), y = td_rate, fill = td_rate)) +
  geom_col() +
  geom_text(aes(label = paste0(round(td_rate, 1), "%")), vjust = -0.5) +
  scale_fill_gradient(low = "#FFC107", high = "#28A745") +
  labs(
    title = "Touchdown Rate by Starting Field Position",
    x = "Starting Field Position",
    y = "TD Rate (%)"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
ggsave("field_position.png", width = 10, height = 6)
import nfl_data_py as nfl
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

pbp = nfl.import_pbp_data([2023])
# One row per drive: starting yardline and final result
drives = pbp[(pbp["drive"].notna()) & (pbp["yardline_100"].notna())]
drive_summary = (drives.groupby(["game_id", "drive"])
                 .agg(
                     start_yard=("yardline_100", "first"),
                     result=("fixed_drive_result", "last")
                 )
                 .reset_index())
# .copy() so the column assignments below write to an owned frame instead of
# a view of the filtered slice (avoids SettingWithCopyWarning)
drive_summary = drive_summary[drive_summary["result"].notna()].copy()
# Bucket starting field position (yardline_100 = yards from opponent end zone)
def get_zone(yard):
    if yard >= 80: return "Own 0-20"
    elif yard >= 60: return "Own 20-40"
    elif yard >= 40: return "Midfield"
    elif yard >= 20: return "Opp 20-40"
    else: return "Red Zone"
drive_summary["zone"] = drive_summary["start_yard"].apply(get_zone)
drive_summary["scored"] = drive_summary["result"] == "Touchdown"
# Calculate TD rate by zone
zone_stats = (drive_summary.groupby("zone")
              .agg(
                  drives=("scored", "count"),
                  tds=("scored", "sum"),
                  td_rate=("scored", "mean")
              )
              .reset_index())
zone_stats["td_rate_pct"] = zone_stats["td_rate"] * 100
zone_stats = zone_stats.sort_values("td_rate_pct", ascending=False)
# Bar chart with percentage labels above each bar
fig, ax = plt.subplots(figsize=(10, 6))
colors = plt.cm.RdYlGn(zone_stats["td_rate"])
bars = ax.bar(zone_stats["zone"], zone_stats["td_rate_pct"], color=colors)
for bar, rate in zip(bars, zone_stats["td_rate_pct"]):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
            f"{rate:.1f}%", ha="center", va="bottom")
ax.set_xlabel("Starting Field Position")
ax.set_ylabel("TD Rate (%)")
ax.set_title("Touchdown Rate by Starting Field Position", fontweight="bold")
plt.tight_layout()
plt.savefig("field_position.png", dpi=300)
plt.show()
nflfastR
ggplot2
tidyverse
matplotlib
pandas
library(nflfastR)
library(tidyverse)
library(gganimate)
library(gifski)
pbp <- load_pbp(2023)
# Calculate cumulative EPA by week
# (running mean of the weekly per-play EPA averages, per team)
weekly_cumulative <- pbp %>%
filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
group_by(posteam, week) %>%
summarize(weekly_epa = mean(epa), .groups = "drop") %>%
arrange(posteam, week) %>%
group_by(posteam) %>%
# cumsum/row_number = mean of weekly means so far, not play-weighted EPA
mutate(cumulative_epa = cumsum(weekly_epa) / row_number()) %>%
ungroup()
# Add rankings (1 = best cumulative EPA; ties get averaged ranks)
weekly_ranked <- weekly_cumulative %>%
group_by(week) %>%
mutate(rank = rank(-cumulative_epa)) %>%
ungroup()
# Create animated bar chart race; {closest_state} is filled in by gganimate
# with the week being shown in each frame
p <- ggplot(weekly_ranked,
aes(x = rank, y = cumulative_epa, fill = posteam)) +
geom_col(width = 0.8) +
geom_text(aes(label = posteam), hjust = -0.1, size = 4) +
# flip + reversed scale puts rank 1 at the top of the chart
coord_flip(clip = "off") +
scale_x_reverse() +
labs(
title = "NFL EPA Rankings Through Week {closest_state}",
x = "Rank",
y = "Cumulative EPA/Play"
) +
theme_minimal() +
theme(legend.position = "none") +
transition_states(week, transition_length = 4, state_length = 1)
# Render animation (gifski backend) and save the last rendered animation
animate(p, nframes = 200, fps = 20, width = 800, height = 600)
anim_save("epa_race.gif")
import nfl_data_py as nfl
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
import matplotlib.animation as animation

pbp = nfl.import_pbp_data([2023])
# Offensive snaps with EPA
plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"].isin(["pass", "run"]))]
weekly_epa = (plays.groupby(["posteam", "week"])["epa"]
              .mean()
              .reset_index()
              .sort_values(["posteam", "week"]))
# Running mean of the weekly EPA values, per team.
# transform() aligns results by index; the original
# expanding().mean().reset_index(drop=True) relied on fragile positional
# alignment between the groupby output and the frame.
weekly_epa["cumulative_epa"] = (weekly_epa.groupby("posteam")["epa"]
                                .transform(lambda s: s.expanding().mean()))
# Rankings within each week (1 = best)
weekly_epa["rank"] = weekly_epa.groupby("week")["cumulative_epa"].rank(ascending=False)
# Static snapshot for the final week (animation requires extra setup)
final_week = weekly_epa["week"].max()
final_data = weekly_epa[weekly_epa["week"] == final_week].sort_values("cumulative_epa", ascending=True)
fig, ax = plt.subplots(figsize=(10, 12))
colors = plt.cm.RdYlGn((final_data["cumulative_epa"] - final_data["cumulative_epa"].min()) /
                       (final_data["cumulative_epa"].max() - final_data["cumulative_epa"].min()))
bars = ax.barh(final_data["posteam"], final_data["cumulative_epa"], color=colors)
ax.set_xlabel("Cumulative EPA/Play")
ax.set_title(f"NFL EPA Rankings Through Week {final_week}", fontweight="bold")
ax.axvline(x=0, color="gray", linestyle="--", alpha=0.5)
for bar, epa in zip(bars, final_data["cumulative_epa"]):
    ax.text(epa + 0.005, bar.get_y() + bar.get_height()/2,
            f"{epa:.3f}", va="center", fontsize=8)
plt.tight_layout()
plt.savefig("epa_rankings.png", dpi=300)
plt.show()
print("Note: For true animation, use matplotlib FuncAnimation or plotly")
nflfastR
tidyverse
gganimate
gifski
matplotlib
pandas
Deep dive into quarterback and passing game analytics
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Depth-of-target profile per passer (min. 200 attempts)
targeted_throws <- pbp %>%
  filter(!is.na(passer_player_id), !is.na(air_yards))
qb_air_yards <- targeted_throws %>%
  group_by(passer_player_id, passer_player_name) %>%
  summarize(
    attempts = n(),
    total_air_yards = sum(air_yards),
    avg_air_yards = mean(air_yards),
    deep_pct = mean(air_yards >= 20) * 100,
    short_pct = mean(air_yards < 5) * 100,
    .groups = "drop"
  ) %>%
  filter(attempts >= 200) %>%
  arrange(desc(avg_air_yards))
print(qb_air_yards)
# How far do completed vs incomplete throws travel on average?
pbp %>%
  filter(!is.na(air_yards)) %>%
  mutate(result = if_else(complete_pass == 1, "Complete", "Incomplete")) %>%
  group_by(result) %>%
  summarize(avg_air_yards = mean(air_yards))
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])
# Rows with a credited passer and a recorded depth of target
throw_mask = pbp["passer_player_id"].notna() & pbp["air_yards"].notna()
throws = pbp[throw_mask]
# Depth-of-target profile per passer
qb_air_yards = throws.groupby(["passer_player_id", "passer_player_name"]).agg(
    attempts=("air_yards", "count"),
    total_air_yards=("air_yards", "sum"),
    avg_air_yards=("air_yards", "mean"),
    deep_pct=("air_yards", lambda x: (x >= 20).mean() * 100),
).reset_index()
# Qualifiers only, deepest average target first
qb_air_yards = (qb_air_yards[qb_air_yards["attempts"] >= 200]
                .sort_values("avg_air_yards", ascending=False))
print("QB Air Yards Analysis:")
print(qb_air_yards)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Completion percentage over expected per passer (min. 200 attempts)
cpoe_throws <- pbp %>%
  filter(!is.na(passer_player_id), !is.na(cpoe))
qb_cpoe <- cpoe_throws %>%
  group_by(passer_player_id, passer_player_name) %>%
  summarize(
    attempts = n(),
    completion_pct = mean(complete_pass) * 100,
    exp_completion_pct = mean(cp) * 100,
    cpoe = mean(cpoe),
    .groups = "drop"
  ) %>%
  filter(attempts >= 200) %>%
  arrange(desc(cpoe))
print(qb_cpoe)
# Does CPOE vary with how far the ball travels?
pbp %>%
  filter(!is.na(cpoe), !is.na(air_yards)) %>%
  mutate(
    depth = case_when(
      air_yards < 0 ~ "Behind LOS",
      air_yards < 10 ~ "Short (0-9)",
      air_yards < 20 ~ "Medium (10-19)",
      TRUE ~ "Deep (20+)"
    )
  ) %>%
  group_by(depth) %>%
  summarize(
    attempts = n(),
    avg_cpoe = mean(cpoe),
    completion_rate = mean(complete_pass)
  )
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])
# Completion percentage over expected per passer (min. 200 attempts)
cpoe_rows = pbp[pbp["passer_player_id"].notna() & pbp["cpoe"].notna()]
qb_cpoe = cpoe_rows.groupby(["passer_player_id", "passer_player_name"]).agg(
    attempts=("cpoe", "count"),
    completion_pct=("complete_pass", lambda x: x.mean() * 100),
    cpoe=("cpoe", "mean"),
).reset_index()
qb_cpoe = (qb_cpoe[qb_cpoe["attempts"] >= 200]
           .sort_values("cpoe", ascending=False))
print("QB CPOE Rankings:")
print(qb_cpoe)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Efficiency by depth of target. na.rm = TRUE added throughout so a single
# missing value doesn't turn a whole bucket's rate into NA (the original
# mixed na.rm and non-na.rm aggregates inconsistently).
depth_analysis <- pbp %>%
  filter(!is.na(air_yards), play_type == "pass") %>%
  mutate(
    depth_zone = case_when(
      air_yards < 0 ~ "Behind LOS",
      air_yards < 5 ~ "Short (0-4)",
      air_yards < 10 ~ "Intermediate (5-9)",
      air_yards < 15 ~ "Medium (10-14)",
      air_yards < 20 ~ "Deep (15-19)",
      TRUE ~ "Bomb (20+)"
    )
  ) %>%
  group_by(depth_zone) %>%
  summarize(
    attempts = n(),
    completions = sum(complete_pass, na.rm = TRUE),
    comp_pct = mean(complete_pass, na.rm = TRUE) * 100,
    avg_epa = mean(epa, na.rm = TRUE),
    td_rate = mean(pass_touchdown, na.rm = TRUE) * 100,
    int_rate = mean(interception, na.rm = TRUE) * 100,
    .groups = "drop"
  )
print(depth_analysis)
# Team deep passing rankings (20+ air yards)
pbp %>%
  filter(air_yards >= 20, !is.na(epa)) %>%
  group_by(posteam) %>%
  summarize(
    deep_attempts = n(),
    deep_comp_pct = mean(complete_pass, na.rm = TRUE) * 100,
    deep_epa = mean(epa)
  ) %>%
  arrange(desc(deep_epa))
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])
# Pass plays with a recorded depth of target. .copy() so the depth_zone
# assignment below writes to an owned frame rather than a view of pbp
# (avoids SettingWithCopyWarning).
passes = pbp[(pbp["air_yards"].notna()) & (pbp["play_type"] == "pass")].copy()
# Depth zones
def get_depth_zone(ay):
    if ay < 0: return "Behind LOS"
    elif ay < 5: return "Short (0-4)"
    elif ay < 10: return "Intermediate (5-9)"
    elif ay < 15: return "Medium (10-14)"
    elif ay < 20: return "Deep (15-19)"
    else: return "Bomb (20+)"
passes["depth_zone"] = passes["air_yards"].apply(get_depth_zone)
depth_analysis = (passes.groupby("depth_zone")
                  .agg(
                      attempts=("air_yards", "count"),
                      comp_pct=("complete_pass", lambda x: x.mean() * 100),
                      avg_epa=("epa", "mean"),
                      td_rate=("pass_touchdown", lambda x: x.mean() * 100)
                  )
                  .reset_index())
print("Passing Efficiency by Depth:")
print(depth_analysis)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Sack rates per passer (min. 200 dropbacks); low sack rate is good
pass_snaps <- pbp %>%
  filter(!is.na(passer_player_id), play_type == "pass")
qb_pressure <- pass_snaps %>%
  group_by(passer_player_id, passer_player_name) %>%
  summarize(
    dropbacks = n(),
    sacks = sum(sack),
    sack_rate = mean(sack) * 100,
    epa_no_sack = mean(epa[sack == 0], na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(dropbacks >= 200) %>%
  arrange(sack_rate)
print(qb_pressure)
# EPA on sack vs non-sack dropbacks (sack used as a crude pressure proxy)
pbp %>%
  filter(play_type == "pass", !is.na(epa)) %>%
  group_by(sack = as.logical(sack)) %>%
  summarize(
    plays = n(),
    avg_epa = mean(epa)
  )
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])
# Per-passer sack profile over all pass plays
dropback_rows = pbp[pbp["passer_player_id"].notna() & (pbp["play_type"] == "pass")]
qb_pressure = dropback_rows.groupby(["passer_player_id", "passer_player_name"]).agg(
    dropbacks=("sack", "count"),
    sacks=("sack", "sum"),
    sack_rate=("sack", lambda x: x.mean() * 100),
).reset_index()
# Qualifiers only, fewest sacks per dropback first
qb_pressure = (qb_pressure[qb_pressure["dropbacks"] >= 200]
               .sort_values("sack_rate"))
print("QB Sack Rates (lowest is best):")
print(qb_pressure)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Pass plays with both a depth and a recorded location.
# (Removed the unused x_loc -1/0/1 mapping: it was computed but never read.)
pass_locations <- pbp %>%
  filter(
    play_type == "pass",
    !is.na(air_yards),
    !is.na(pass_location)
  )
# Share of passes and EPA by location, per team; ungroup() so downstream
# verbs don't silently operate per-team
team_tendencies <- pass_locations %>%
  group_by(posteam, pass_location) %>%
  summarize(
    attempts = n(),
    epa = mean(epa, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  group_by(posteam) %>%
  mutate(pct = attempts / sum(attempts) * 100) %>%
  ungroup()
# Visualize (sample for one team)
team_tendencies %>%
  filter(posteam == "KC") %>%
  ggplot(aes(x = pass_location, y = pct, fill = epa)) +
  geom_col() +
  scale_fill_gradient2(low = "red", mid = "white", high = "green", midpoint = 0) +
  labs(title = "KC Pass Location Tendencies", y = "% of Passes") +
  theme_minimal()
import nfl_data_py as nfl
import pandas as pd
import matplotlib.pyplot as plt

pbp = nfl.import_pbp_data([2023])
# Pass plays with both a depth and a recorded location
loc_mask = ((pbp["play_type"] == "pass")
            & pbp["air_yards"].notna()
            & pbp["pass_location"].notna())
located_passes = pbp[loc_mask]
# Attempts and EPA by (team, location)
team_tendencies = located_passes.groupby(["posteam", "pass_location"]).agg(
    attempts=("epa", "count"),
    epa=("epa", "mean"),
).reset_index()
# Convert attempts to a share of each team's total
per_team_total = team_tendencies.groupby("posteam")["attempts"].transform("sum")
team_tendencies["pct"] = team_tendencies["attempts"] / per_team_total * 100
print("Team Pass Location Tendencies:")
print(team_tendencies[team_tendencies["posteam"] == "KC"])
nflfastR
tidyverse
ggplot2
nfl_data_py
pandas
matplotlib
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Success rate per dropback, sacks and scrambles included (min. 200 dropbacks)
dropback_plays <- pbp %>%
  filter(!is.na(passer_player_id), qb_dropback == 1)
qb_dropback <- dropback_plays %>%
  group_by(passer_player_id, passer_player_name) %>%
  summarize(
    dropbacks = n(),
    success_rate = mean(success, na.rm = TRUE) * 100,
    epa_per_dropback = mean(epa, na.rm = TRUE),
    sack_rate = mean(sack) * 100,
    scramble_rate = mean(qb_scramble, na.rm = TRUE) * 100,
    .groups = "drop"
  ) %>%
  filter(dropbacks >= 200) %>%
  arrange(desc(success_rate))
print(qb_dropback)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])
# Dropback-level efficiency: success, EPA, and sack rate per passer
db_rows = pbp[pbp["passer_player_id"].notna() & (pbp["qb_dropback"] == 1)]
qb_dropback = db_rows.groupby(["passer_player_id", "passer_player_name"]).agg(
    dropbacks=("success", "count"),
    success_rate=("success", lambda x: x.mean() * 100),
    epa_per_dropback=("epa", "mean"),
    sack_rate=("sack", lambda x: x.mean() * 100),
).reset_index()
# Qualifiers only, most successful first
qb_dropback = (qb_dropback[qb_dropback["dropbacks"] >= 200]
               .sort_values("success_rate", ascending=False))
print("QB Dropback Success Rate:")
print(qb_dropback)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Play action analysis
# NOTE(review): if_else() is type-strict. This assumes `play_action` is a
# logical column; if this pbp build stores it as 0/1 numeric, the FALSE
# branch will error on type mismatch -- verify the column type first.
play_action <- pbp %>%
filter(play_type == "pass", !is.na(epa)) %>%
mutate(play_action = if_else(is.na(play_action), FALSE, play_action))
# Overall comparison: PA vs non-PA attempts, EPA, success, depth
play_action %>%
group_by(play_action) %>%
summarize(
attempts = n(),
avg_epa = mean(epa),
success_rate = mean(success) * 100,
avg_air_yards = mean(air_yards, na.rm = TRUE)
)
# Team play action usage and effectiveness
# (epa[play_action] logical subsetting relies on the column being logical,
# per the note above)
team_pa <- play_action %>%
group_by(posteam) %>%
summarize(
total_passes = n(),
pa_rate = mean(play_action) * 100,
pa_epa = mean(epa[play_action]),
no_pa_epa = mean(epa[!play_action]),
# pa_advantage recomputes the two means rather than reusing the columns
pa_advantage = mean(epa[play_action]) - mean(epa[!play_action])
) %>%
arrange(desc(pa_advantage))
print(team_pa)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])
# Pass plays with EPA. Normalize play_action to a true boolean dtype:
# the source column may be 0/1 floats, in which case `~col` would be a
# bitwise negation and `df[col]` masking would misbehave.
passes = pbp[(pbp["play_type"] == "pass") & (pbp["epa"].notna())].copy()
passes["play_action"] = passes["play_action"].fillna(False).astype(bool)
# Overall comparison
pa_comparison = (passes.groupby("play_action")
                 .agg(
                     attempts=("epa", "count"),
                     avg_epa=("epa", "mean"),
                     success_rate=("success", lambda x: x.mean() * 100)
                 )
                 .reset_index())
print("Play Action vs Standard Passing:")
print(pa_comparison)
# Team play action effectiveness, built from plain grouped means instead of
# groupby.apply(lambda -> Series) (a deprecated/warning-prone pattern)
pa_rate = passes.groupby("posteam")["play_action"].mean() * 100
pa_epa = passes[passes["play_action"]].groupby("posteam")["epa"].mean()
no_pa_epa = passes[~passes["play_action"]].groupby("posteam")["epa"].mean()
team_pa = pd.DataFrame({
    "pa_rate": pa_rate,
    "pa_epa": pa_epa,
    "no_pa_epa": no_pa_epa,
}).reset_index()
team_pa["pa_advantage"] = team_pa["pa_epa"] - team_pa["no_pa_epa"]
print("\nTeam Play Action Effectiveness:")
print(team_pa.sort_values("pa_advantage", ascending=False))
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Scramble frequency and production per passer
# (min. 200 dropbacks and 10 scrambles)
qb_plays <- pbp %>%
  filter(!is.na(passer_player_id))
qb_scrambles <- qb_plays %>%
  group_by(passer_player_id, passer_player_name) %>%
  summarize(
    dropbacks = sum(qb_dropback, na.rm = TRUE),
    scrambles = sum(qb_scramble, na.rm = TRUE),
    scramble_rate = mean(qb_scramble, na.rm = TRUE) * 100,
    scramble_yards = sum(yards_gained[qb_scramble == 1], na.rm = TRUE),
    scramble_epa = mean(epa[qb_scramble == 1], na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(dropbacks >= 200, scrambles >= 10) %>%
  arrange(desc(scramble_rate))
print(qb_scrambles)
# How do scrambles fare by game situation?
pbp %>%
  filter(qb_scramble == 1) %>%
  mutate(
    situation = case_when(
      down <= 2 & ydstogo <= 5 ~ "Short yardage",
      down == 3 ~ "Third down",
      down == 4 ~ "Fourth down",
      TRUE ~ "Normal"
    )
  ) %>%
  group_by(situation) %>%
  summarize(
    scrambles = n(),
    avg_yards = mean(yards_gained),
    success_rate = mean(success)
  )
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])
# Scramble frequency per passer
qb_rows = pbp[pbp["passer_player_id"].notna()]
qb_scrambles = qb_rows.groupby(["passer_player_id", "passer_player_name"]).agg(
    dropbacks=("qb_dropback", "sum"),
    scrambles=("qb_scramble", "sum"),
    scramble_rate=("qb_scramble", lambda x: x.mean() * 100),
).reset_index()
# Require both a dropback floor and a scramble floor before ranking
qualified = (qb_scrambles["dropbacks"] >= 200) & (qb_scrambles["scrambles"] >= 10)
qb_scrambles = qb_scrambles[qualified].sort_values("scramble_rate", ascending=False)
print("QB Scramble Rates:")
print(qb_scrambles)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Target volume and production per receiver
targeted <- pbp %>%
  filter(!is.na(receiver_player_id), play_type == "pass")
receiver_targets <- targeted %>%
  group_by(receiver_player_id, receiver_player_name, posteam) %>%
  summarize(
    targets = n(),
    receptions = sum(complete_pass),
    yards = sum(yards_gained, na.rm = TRUE),
    tds = sum(pass_touchdown),
    epa = sum(epa, na.rm = TRUE),
    avg_depth = mean(air_yards, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(targets))
# Express each player's targets as a share of his team's total
receiver_targets <- receiver_targets %>%
  group_by(posteam) %>%
  mutate(
    team_targets = sum(targets),
    target_share = targets / team_targets * 100
  ) %>%
  ungroup()
print(receiver_targets %>% head(30))
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])
# Targets = all pass plays crediting a receiver
passes = pbp[(pbp["receiver_player_id"].notna()) & (pbp["play_type"] == "pass")]
receiver_targets = (passes.groupby(["receiver_player_id", "receiver_player_name", "posteam"])
                    .agg(
                        # Count a never-null key so targets equals the row
                        # count; counting "epa" (as before) silently drops
                        # plays whose EPA is missing.
                        targets=("play_id", "count"),
                        receptions=("complete_pass", "sum"),
                        yards=("yards_gained", "sum"),
                        tds=("pass_touchdown", "sum"),
                        epa=("epa", "sum"),
                        avg_depth=("air_yards", "mean")
                    )
                    .reset_index()
                    .sort_values("targets", ascending=False))
# Add target share (player targets / team targets)
team_targets = receiver_targets.groupby("posteam")["targets"].transform("sum")
receiver_targets["target_share"] = receiver_targets["targets"] / team_targets * 100
print("Top Receivers by Targets:")
print(receiver_targets.head(30))
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Yards after catch per receiver (min. 40 receptions)
catches <- pbp %>%
  filter(complete_pass == 1, !is.na(yards_after_catch))
receiver_yac <- catches %>%
  group_by(receiver_player_id, receiver_player_name) %>%
  summarize(
    receptions = n(),
    total_yac = sum(yards_after_catch),
    avg_yac = mean(yards_after_catch),
    # denominator is receptions here, so this mirrors avg_yac
    yac_per_target = total_yac / n(),
    .groups = "drop"
  ) %>%
  filter(receptions >= 40) %>%
  arrange(desc(avg_yac))
print(receiver_yac)
# Team split: how much of completion yardage comes after the catch
team_yac <- catches %>%
  group_by(posteam) %>%
  summarize(
    completions = n(),
    avg_air_yards = mean(air_yards, na.rm = TRUE),
    avg_yac = mean(yards_after_catch),
    yac_pct = avg_yac / (avg_air_yards + avg_yac) * 100
  ) %>%
  arrange(desc(avg_yac))
print(team_yac)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])
# Completed catches with a recorded YAC value
catch_rows = pbp[(pbp["complete_pass"] == 1) & pbp["yards_after_catch"].notna()]
# Per-receiver YAC profile
receiver_yac = catch_rows.groupby(["receiver_player_id", "receiver_player_name"]).agg(
    receptions=("yards_after_catch", "count"),
    total_yac=("yards_after_catch", "sum"),
    avg_yac=("yards_after_catch", "mean"),
).reset_index()
receiver_yac = (receiver_yac[receiver_yac["receptions"] >= 40]
                .sort_values("avg_yac", ascending=False))
print("Top Receivers by YAC:")
print(receiver_yac.head(20))
# Team-level air yards vs YAC balance
team_yac = (catch_rows.groupby("posteam")
            .agg(
                avg_air_yards=("air_yards", "mean"),
                avg_yac=("yards_after_catch", "mean")
            )
            .reset_index()
            .sort_values("avg_yac", ascending=False))
print("\nTeam YAC Rankings:")
print(team_yac)
nflfastR
tidyverse
nfl_data_py
pandas
Running back and rushing game analytics
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Team-level rushing efficiency
rush_plays <- pbp %>%
  filter(play_type == "run", !is.na(epa))
team_rushing <- rush_plays %>%
  group_by(posteam) %>%
  summarize(
    rush_attempts = n(),
    avg_yards = mean(yards_gained),
    rush_epa = mean(epa),
    success_rate = mean(success) * 100,
    .groups = "drop"
  ) %>%
  arrange(desc(rush_epa))
print(team_rushing)
# Ball-carrier efficiency (min. 100 carries)
rb_efficiency <- pbp %>%
  filter(!is.na(rusher_player_id), play_type == "run") %>%
  group_by(rusher_player_id, rusher_player_name) %>%
  summarize(
    attempts = n(),
    yards = sum(yards_gained),
    ypc = mean(yards_gained),
    epa = mean(epa, na.rm = TRUE),
    success_rate = mean(success) * 100,
    .groups = "drop"
  ) %>%
  filter(attempts >= 100) %>%
  arrange(desc(epa))
print(rb_efficiency)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])
# Team-level rushing efficiency
rush_rows = pbp[(pbp["play_type"] == "run") & pbp["epa"].notna()]
team_rushing = rush_rows.groupby("posteam").agg(
    rush_attempts=("epa", "count"),
    avg_yards=("yards_gained", "mean"),
    rush_epa=("epa", "mean"),
    success_rate=("success", lambda x: x.mean() * 100),
).reset_index()
team_rushing = team_rushing.sort_values("rush_epa", ascending=False)
print("Team Rushing Rankings:")
print(team_rushing)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Efficiency by run location x gap
located_runs <- pbp %>%
  filter(play_type == "run", !is.na(run_location), !is.na(run_gap))
run_direction <- located_runs %>%
  group_by(run_location, run_gap) %>%
  summarize(
    attempts = n(),
    avg_yards = mean(yards_gained),
    epa = mean(epa, na.rm = TRUE),
    success_rate = mean(success) * 100,
    .groups = "drop"
  ) %>%
  arrange(desc(epa))
print(run_direction)
# Each team's left/middle/right mix, one row per team (wide layout)
team_direction <- pbp %>%
  filter(play_type == "run", !is.na(run_location)) %>%
  group_by(posteam, run_location) %>%
  summarize(attempts = n(), .groups = "drop") %>%
  group_by(posteam) %>%
  mutate(pct = attempts / sum(attempts) * 100) %>%
  pivot_wider(names_from = run_location, values_from = c(attempts, pct))
print(team_direction)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])
# Runs with both a location and a gap recorded
direction_mask = ((pbp["play_type"] == "run")
                  & pbp["run_location"].notna()
                  & pbp["run_gap"].notna())
located_runs = pbp[direction_mask]
# Efficiency by location x gap, best EPA first
run_direction = located_runs.groupby(["run_location", "run_gap"]).agg(
    attempts=("epa", "count"),
    avg_yards=("yards_gained", "mean"),
    epa=("epa", "mean"),
    success_rate=("success", lambda x: x.mean() * 100),
).reset_index()
run_direction = run_direction.sort_values("epa", ascending=False)
print("Run Efficiency by Direction:")
print(run_direction)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Rushing efficiency vs the number of defenders in the box
boxed_runs <- pbp %>%
  filter(play_type == "run", !is.na(defenders_in_box), !is.na(epa))
box_analysis <- boxed_runs %>%
  group_by(defenders_in_box) %>%
  summarize(
    attempts = n(),
    avg_yards = mean(yards_gained),
    epa = mean(epa),
    success_rate = mean(success) * 100,
    .groups = "drop"
  ) %>%
  filter(attempts >= 100)
print(box_analysis)
# Which teams still run well into stacked (8+) boxes?
stacked_box <- pbp %>%
  filter(play_type == "run", defenders_in_box >= 8, !is.na(epa)) %>%
  group_by(posteam) %>%
  summarize(
    stacked_attempts = n(),
    stacked_epa = mean(epa),
    stacked_success = mean(success) * 100
  ) %>%
  arrange(desc(stacked_epa))
print(stacked_box)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])
# Runs with a recorded box count and EPA
box_mask = ((pbp["play_type"] == "run")
            & pbp["defenders_in_box"].notna()
            & pbp["epa"].notna())
boxed_runs = pbp[box_mask]
# Efficiency by defenders in the box; drop thin samples
box_analysis = boxed_runs.groupby("defenders_in_box").agg(
    attempts=("epa", "count"),
    avg_yards=("yards_gained", "mean"),
    epa=("epa", "mean"),
    success_rate=("success", lambda x: x.mean() * 100),
).reset_index()
box_analysis = box_analysis[box_analysis["attempts"] >= 100]
print("Rushing by Defenders in Box:")
print(box_analysis)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Runs snapped inside the opponent 5-yard line
goal_line <- pbp %>%
  filter(play_type == "run", yardline_100 <= 5, !is.na(epa))
# League-wide goal line rushing baseline
goal_line %>%
  summarize(
    attempts = n(),
    td_rate = mean(rush_touchdown) * 100,
    success_rate = mean(success) * 100,
    avg_yards = mean(yards_gained)
  )
# Team-by-team punch-in rates (min. 10 attempts)
team_goal_line <- goal_line %>%
  group_by(posteam) %>%
  summarize(
    attempts = n(),
    touchdowns = sum(rush_touchdown),
    td_rate = mean(rush_touchdown) * 100,
    success_rate = mean(success) * 100
  ) %>%
  filter(attempts >= 10) %>%
  arrange(desc(td_rate))
print(team_goal_line)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])
# Runs snapped inside the opponent 5-yard line
gl_mask = ((pbp["play_type"] == "run")
           & (pbp["yardline_100"] <= 5)
           & pbp["epa"].notna())
goal_line = pbp[gl_mask]
# Team punch-in rates (min. 10 attempts)
team_goal_line = goal_line.groupby("posteam").agg(
    attempts=("rush_touchdown", "count"),
    touchdowns=("rush_touchdown", "sum"),
    td_rate=("rush_touchdown", lambda x: x.mean() * 100),
    success_rate=("success", lambda x: x.mean() * 100),
).reset_index()
team_goal_line = (team_goal_line[team_goal_line["attempts"] >= 10]
                  .sort_values("td_rate", ascending=False))
print("Goal Line Rushing Efficiency:")
print(team_goal_line)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Rushing production per ball carrier
rb_rush <- pbp %>%
  filter(!is.na(rusher_player_id), play_type == "run") %>%
  group_by(rusher_player_id) %>%
  summarize(
    rush_att = n(),
    rush_yards = sum(yards_gained),
    rush_epa = sum(epa, na.rm = TRUE),
    .groups = "drop"
  )
# Receiving production per targeted player (all positions; the join below
# keeps only players who also carry the ball)
rb_rec <- pbp %>%
  filter(!is.na(receiver_player_id), play_type == "pass") %>%
  group_by(receiver_player_id) %>%
  summarize(
    targets = n(),
    receptions = sum(complete_pass),
    rec_yards = sum(yards_gained[complete_pass == 1], na.rm = TRUE),
    rec_epa = sum(epa, na.rm = TRUE),
    .groups = "drop"
  )
# Dual-threat view: match rushing and receiving lines on player ID
rb_combined <- rb_rush %>%
  inner_join(rb_rec, by = c("rusher_player_id" = "receiver_player_id")) %>%
  filter(rush_att >= 50) %>%
  mutate(
    total_epa = rush_epa + rec_epa,
    rec_epa_pct = rec_epa / total_epa * 100
  ) %>%
  arrange(desc(total_epa))
print(rb_combined)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])
# Rushing production per ball carrier
carries = pbp[pbp["rusher_player_id"].notna() & (pbp["play_type"] == "run")]
rb_rush = carries.groupby("rusher_player_id").agg(
    rush_att=("epa", "count"),
    rush_yards=("yards_gained", "sum"),
    rush_epa=("epa", "sum"),
).reset_index()
# Receiving production per targeted player
targets_df = pbp[pbp["receiver_player_id"].notna() & (pbp["play_type"] == "pass")]
rb_rec = targets_df.groupby("receiver_player_id").agg(
    targets=("epa", "count"),
    receptions=("complete_pass", "sum"),
    rec_epa=("epa", "sum"),
).reset_index()
# Keep players who both rush and catch; sum the two EPA streams
rb_combined = rb_rush.merge(rb_rec, left_on="rusher_player_id",
                            right_on="receiver_player_id", how="inner")
rb_combined = rb_combined[rb_combined["rush_att"] >= 50]
rb_combined["total_epa"] = rb_combined["rush_epa"] + rb_combined["rec_epa"]
print("RB Total EPA (Rush + Receiving):")
print(rb_combined.sort_values("total_epa", ascending=False).head(20))
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Rushing efficiency by offensive formation (min. 100 attempts)
# NOTE(review): assumes `offense_formation` is populated in this pbp build -- verify
formation_rush <- pbp %>%
  filter(play_type == "run", !is.na(epa), !is.na(offense_formation)) %>%
  group_by(offense_formation) %>%
  summarize(
    attempts = n(),
    avg_yards = mean(yards_gained),
    epa = mean(epa),
    success_rate = mean(success) * 100,
    .groups = "drop"
  ) %>%
  filter(attempts >= 100) %>%
  arrange(desc(epa))
print(formation_rush)
# Shotgun vs under-center split (plays with NA formation fall into an NA group)
pbp %>%
  filter(play_type == "run", !is.na(epa)) %>%
  mutate(
    shotgun = if_else(offense_formation == "SHOTGUN", "Shotgun", "Under Center")
  ) %>%
  group_by(shotgun) %>%
  summarize(
    attempts = n(),
    avg_yards = mean(yards_gained),
    epa = mean(epa),
    success_rate = mean(success) * 100
  )
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])
# Runs with EPA and a recorded formation
form_mask = ((pbp["play_type"] == "run")
             & pbp["epa"].notna()
             & pbp["offense_formation"].notna())
formation_runs = pbp[form_mask]
# Efficiency by formation, best EPA first (min. 100 attempts)
formation_rush = formation_runs.groupby("offense_formation").agg(
    attempts=("epa", "count"),
    avg_yards=("yards_gained", "mean"),
    epa=("epa", "mean"),
    success_rate=("success", lambda x: x.mean() * 100),
).reset_index()
formation_rush = (formation_rush[formation_rush["attempts"] >= 100]
                  .sort_values("epa", ascending=False))
print("Rushing Efficiency by Formation:")
print(formation_rush)
nflfastR
tidyverse
nfl_data_py
pandas
Analyze defensive performance, coverage, and pressure metrics
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# EPA allowed per defense, split by play type; lower = better defense
def_snaps <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run"))
def_epa <- def_snaps %>%
  group_by(defteam) %>%
  summarize(
    plays = n(),
    epa_allowed = mean(epa),
    pass_epa_allowed = mean(epa[play_type == "pass"]),
    rush_epa_allowed = mean(epa[play_type == "run"]),
    success_allowed = mean(success) * 100,
    .groups = "drop"
  ) %>%
  arrange(epa_allowed) # Lower is better
print(def_epa)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])
# Offensive snaps with EPA, viewed from the defense's side
def_snaps = pbp[pbp["epa"].notna() & pbp["play_type"].isin(["pass", "run"])]
def_epa = def_snaps.groupby("defteam").agg(
    plays=("epa", "count"),
    epa_allowed=("epa", "mean"),
    success_allowed=("success", lambda x: x.mean() * 100),
).reset_index()
def_epa = def_epa.sort_values("epa_allowed")  # Lower is better
print("Defensive EPA Rankings (lower is better):")
print(def_epa)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Pass-rush production per defense: sacks, QB hits, EPA allowed
faced_passes <- pbp %>%
  filter(play_type == "pass", !is.na(epa))
pass_rush <- faced_passes %>%
  group_by(defteam) %>%
  summarize(
    pass_plays_faced = n(),
    sacks = sum(sack),
    sack_rate = mean(sack) * 100,
    qb_hits = sum(qb_hit, na.rm = TRUE),
    epa_allowed = mean(epa),
    .groups = "drop"
  ) %>%
  arrange(desc(sack_rate))
print(pass_rush)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])
# Pass plays faced, viewed from the defense's side
faced_passes = pbp[(pbp["play_type"] == "pass") & pbp["epa"].notna()]
pass_rush = faced_passes.groupby("defteam").agg(
    pass_plays=("epa", "count"),
    sacks=("sack", "sum"),
    sack_rate=("sack", lambda x: x.mean() * 100),
    epa_allowed=("epa", "mean"),
).reset_index()
pass_rush = pass_rush.sort_values("sack_rate", ascending=False)
print("Team Pass Rush Effectiveness:")
print(pass_rush)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Red zone defense
# Red zone = snap at or inside the opponent 20 (yardline_100 <= 20). There is
# no pass/run filter, so every play type with a non-missing EPA is included.
rz_defense <- pbp %>%
filter(yardline_100 <= 20, !is.na(epa)) %>%
group_by(defteam) %>%
summarize(
rz_plays = n(),
rz_epa_allowed = mean(epa),
td_allowed_rate = mean(touchdown, na.rm = TRUE) * 100,
success_allowed = mean(success) * 100,
.groups = "drop"
) %>%
arrange(rz_epa_allowed)
print(rz_defense)
import nfl_data_py as nfl
import pandas as pd
pbp = nfl.import_pbp_data([2023])
# Red zone defense
# Red zone = snap at or inside the opponent 20; includes every play type with
# a non-missing EPA. touchdown is presumably a 0/1 flag (consistent with the
# mean * 100 usage here), so td_allowed_rate is a percentage of plays.
rz_plays = pbp[(pbp["yardline_100"] <= 20) & (pbp["epa"].notna())]
rz_defense = (rz_plays.groupby("defteam")
.agg(
rz_plays=("epa", "count"),
rz_epa_allowed=("epa", "mean"),
td_allowed_rate=("touchdown", lambda x: x.mean() * 100)
)
.reset_index()
.sort_values("rz_epa_allowed"))
print("Red Zone Defense Rankings:")
print(rz_defense)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Third down defense
# Conversion = first_down flag on a 3rd-down snap; a lower conv_rate_allowed
# is better for the defense (hence the ascending sort).
third_down_def <- pbp %>%
filter(down == 3, !is.na(epa)) %>%
group_by(defteam) %>%
summarize(
third_downs = n(),
conversions_allowed = sum(first_down, na.rm = TRUE),
conv_rate_allowed = mean(first_down, na.rm = TRUE) * 100,
epa_allowed = mean(epa),
.groups = "drop"
) %>%
arrange(conv_rate_allowed)
print(third_down_def)
import nfl_data_py as nfl
import pandas as pd
pbp = nfl.import_pbp_data([2023])
# Third down defense
# Conversion = first_down flag on a 3rd-down snap; ascending sort puts the
# stingiest defenses first.
third_down = pbp[(pbp["down"] == 3) & (pbp["epa"].notna())]
third_down_def = (third_down.groupby("defteam")
.agg(
third_downs=("epa", "count"),
conv_rate_allowed=("first_down", lambda x: x.mean() * 100),
epa_allowed=("epa", "mean")
)
.reset_index()
.sort_values("conv_rate_allowed"))
print("Third Down Defense (lower is better):")
print(third_down_def)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Turnover analysis by defense
# NOTE(review): fumble_lost counts fumbles the offense lost (i.e. defensive
# recoveries), not fumbles forced — the fumbles_forced label is approximate.
turnovers <- pbp %>%
filter(!is.na(defteam)) %>%
group_by(defteam) %>%
summarize(
plays = n(),
interceptions = sum(interception, na.rm = TRUE),
fumbles_forced = sum(fumble_lost, na.rm = TRUE),
total_turnovers = interceptions + fumbles_forced,
turnover_rate = total_turnovers / plays * 100,
.groups = "drop"
) %>%
arrange(desc(total_turnovers))
print(turnovers)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])
# Turnover analysis: takeaways created per defense, with a per-play rate.
plays = pbp[pbp["defteam"].notna()]
# Count plays via play_id (present on every row) rather than "epa", which is
# NA on some plays and understated the denominator used for turnover_rate
# (the R version of this snippet uses n(), i.e. all rows).
turnovers = (plays.groupby("defteam")
    .agg(
        plays=("play_id", "count"),
        interceptions=("interception", "sum"),
        fumbles=("fumble_lost", "sum")
    )
    .reset_index())
turnovers["total_turnovers"] = turnovers["interceptions"] + turnovers["fumbles"]
turnovers["turnover_rate"] = turnovers["total_turnovers"] / turnovers["plays"] * 100
turnovers = turnovers.sort_values("total_turnovers", ascending=False)
print("Defensive Turnovers Created:")
print(turnovers)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# YAC allowed analysis
# Average yards after catch allowed per completion; lower avg_yac_allowed
# suggests better tackling/coverage after the catch.
yac_allowed <- pbp %>%
filter(complete_pass == 1, !is.na(yards_after_catch)) %>%
group_by(defteam) %>%
summarize(
completions_allowed = n(),
total_yac_allowed = sum(yards_after_catch),
avg_yac_allowed = mean(yards_after_catch),
.groups = "drop"
) %>%
arrange(avg_yac_allowed)
print(yac_allowed)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# YAC allowed: completed passes only, grouped by the defense on the field;
# ranked ascending (less YAC allowed = better).
is_completion = (pbp["complete_pass"] == 1) & pbp["yards_after_catch"].notna()
yac_allowed = (
    pbp.loc[is_completion]
    .groupby("defteam")
    .agg(
        completions=("yards_after_catch", "count"),
        total_yac=("yards_after_catch", "sum"),
        avg_yac=("yards_after_catch", "mean"),
    )
    .reset_index()
    .sort_values("avg_yac")
)
print("YAC Allowed by Defense (lower is better):")
print(yac_allowed)
nflfastR
tidyverse
nfl_data_py
pandas
Analyze kicking, punting, and return game performance
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Field goal analysis
# League-wide make rate bucketed by attempt distance; case_when is evaluated
# top-down, so each bucket is a half-open range.
fg_analysis <- pbp %>%
filter(play_type == "field_goal", !is.na(kick_distance)) %>%
mutate(
distance_bucket = case_when(
kick_distance < 30 ~ "Under 30",
kick_distance < 40 ~ "30-39",
kick_distance < 50 ~ "40-49",
TRUE ~ "50+"
)
) %>%
group_by(distance_bucket) %>%
summarize(
attempts = n(),
makes = sum(field_goal_result == "made"),
pct = mean(field_goal_result == "made") * 100,
.groups = "drop"
)
print(fg_analysis)
# Team kicking rankings
# Same make-rate calculation per team; avg_distance contextualizes pct, since
# a kicker attempting longer kicks will naturally make a lower share.
team_fg <- pbp %>%
filter(play_type == "field_goal") %>%
group_by(posteam) %>%
summarize(
attempts = n(),
makes = sum(field_goal_result == "made"),
pct = mean(field_goal_result == "made") * 100,
avg_distance = mean(kick_distance, na.rm = TRUE)
) %>%
arrange(desc(pct))
print(team_fg)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])
# Field goal accuracy bucketed by attempt distance.
# .copy() makes `fgs` an independent frame so the bucket-column assignment
# below is guaranteed to stick (the original assigned into a filtered slice,
# which raises SettingWithCopyWarning and is a no-op under copy-on-write).
fgs = pbp[(pbp["play_type"] == "field_goal") & (pbp["kick_distance"].notna())].copy()

def distance_bucket(d):
    """Map a kick distance in yards to a reporting bucket."""
    if d < 30:
        return "Under 30"
    elif d < 40:
        return "30-39"
    elif d < 50:
        return "40-49"
    else:
        return "50+"

fgs["distance_bucket"] = fgs["kick_distance"].apply(distance_bucket)
fg_analysis = (fgs.groupby("distance_bucket")
    .agg(
        attempts=("field_goal_result", "count"),
        makes=("field_goal_result", lambda x: (x == "made").sum()),
        pct=("field_goal_result", lambda x: (x == "made").mean() * 100)
    )
    .reset_index())
print("Field Goal Accuracy by Distance:")
print(fg_analysis)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Punter analysis
# Gross punting average per punter, with a 20-punt minimum to exclude
# emergency/backup punters.
punt_analysis <- pbp %>%
filter(play_type == "punt", !is.na(kick_distance)) %>%
group_by(punter_player_name) %>%
summarize(
punts = n(),
avg_gross = mean(kick_distance),
# NOTE(review): yardline_100 is the pre-snap field position of the punt,
# not where the punt was downed — inside_20 likely miscounts; verify
# against a punt-outcome field.
inside_20 = sum(yardline_100 <= 20, na.rm = TRUE),
touchbacks = sum(touchback, na.rm = TRUE),
.groups = "drop"
) %>%
filter(punts >= 20) %>%
arrange(desc(avg_gross))
print(punt_analysis)
import nfl_data_py as nfl
import pandas as pd
pbp = nfl.import_pbp_data([2023])
# Punt analysis
# Gross punting average per punter, 20-punt minimum to exclude emergency
# punters; ranked by gross average descending.
punts = pbp[(pbp["play_type"] == "punt") & (pbp["kick_distance"].notna())]
punt_analysis = (punts.groupby("punter_player_name")
.agg(
punts=("kick_distance", "count"),
avg_gross=("kick_distance", "mean"),
touchbacks=("touchback", "sum")
)
.reset_index())
punt_analysis = punt_analysis[punt_analysis["punts"] >= 20].sort_values(
"avg_gross", ascending=False)
print("Punter Rankings:")
print(punt_analysis)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Kickoff return analysis
# Grouped by posteam — presumably the receiving/returning team on kickoffs
# in nflfastR; confirm against the data dictionary. Restricting to rows with
# non-missing return_yards limits this to plays that had a return recorded.
ko_returns <- pbp %>%
filter(play_type == "kickoff", !is.na(return_yards)) %>%
group_by(posteam) %>%
summarize(
returns = n(),
total_yards = sum(return_yards),
avg_return = mean(return_yards),
touchbacks = sum(touchback, na.rm = TRUE),
return_tds = sum(return_touchdown, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(desc(avg_return))
print(ko_returns)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Kickoff return analysis: per-team return volume and efficiency, ranked by
# average yards per return (descending).
mask = (pbp["play_type"] == "kickoff") & pbp["return_yards"].notna()
by_team = pbp.loc[mask].groupby("posteam")
ko_returns = by_team.agg(
    returns=("return_yards", "count"),
    total_yards=("return_yards", "sum"),
    avg_return=("return_yards", "mean"),
    tds=("return_touchdown", "sum"),
)
ko_returns = ko_returns.reset_index().sort_values("avg_return", ascending=False)
print("Kickoff Return Rankings:")
print(ko_returns)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Punt return analysis
# return_yards > 0 deliberately excludes fair catches and zero-yard returns,
# so avg_return is "yards per positive return", not per return opportunity.
punt_returns <- pbp %>%
filter(play_type == "punt", !is.na(return_yards), return_yards > 0) %>%
group_by(posteam) %>%
summarize(
returns = n(),
total_yards = sum(return_yards),
avg_return = mean(return_yards),
return_tds = sum(return_touchdown, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(desc(avg_return))
print(punt_returns)
import nfl_data_py as nfl
import pandas as pd
pbp = nfl.import_pbp_data([2023])
# Punt return analysis
# return_yards > 0 deliberately excludes fair catches and zero-yard returns,
# so avg_return is "yards per positive return", not per return opportunity.
punt_returns = pbp[(pbp["play_type"] == "punt") &
(pbp["return_yards"].notna()) &
(pbp["return_yards"] > 0)]
pr_analysis = (punt_returns.groupby("posteam")
.agg(
returns=("return_yards", "count"),
total_yards=("return_yards", "sum"),
avg_return=("return_yards", "mean")
)
.reset_index()
.sort_values("avg_return", ascending=False))
print("Punt Return Rankings:")
print(pr_analysis)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Special teams EPA by play type
# NOTE(review): st_epa (team x play-type breakdown) is computed but never
# printed — only the overall team ranking below is shown.
st_epa <- pbp %>%
filter(special_teams_play == 1, !is.na(epa)) %>%
mutate(
st_play_type = case_when(
play_type == "kickoff" ~ "Kickoff",
play_type == "punt" ~ "Punt",
play_type == "field_goal" ~ "Field Goal",
play_type == "extra_point" ~ "Extra Point",
TRUE ~ "Other"
)
) %>%
group_by(posteam, st_play_type) %>%
summarize(
plays = n(),
total_epa = sum(epa),
avg_epa = mean(epa),
.groups = "drop"
)
# Team special teams rankings
# Aggregates across all special-teams snaps; ranked by cumulative EPA.
team_st_epa <- pbp %>%
filter(special_teams_play == 1, !is.na(epa)) %>%
group_by(posteam) %>%
summarize(
total_st_epa = sum(epa),
avg_st_epa = mean(epa),
st_plays = n()
) %>%
arrange(desc(total_st_epa))
print(team_st_epa)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])
# Special teams EPA rankings by team.
# .copy() so the st_play_type assignment below writes to an owned frame (the
# original assigned into a filtered slice, which raises SettingWithCopyWarning
# and is a no-op under copy-on-write).
st_plays = pbp[(pbp["special_teams_play"] == 1) & (pbp["epa"].notna())].copy()

# Map raw play types to display labels
def st_type(play):
    if play == "kickoff":
        return "Kickoff"
    elif play == "punt":
        return "Punt"
    elif play == "field_goal":
        return "Field Goal"
    elif play == "extra_point":
        return "Extra Point"
    else:
        return "Other"

st_plays["st_play_type"] = st_plays["play_type"].apply(st_type)
# Team special teams EPA (st_play_type is kept for play-level inspection;
# the ranking aggregates across all special-teams snaps)
team_st_epa = (st_plays.groupby("posteam")
    .agg(
        total_st_epa=("epa", "sum"),
        avg_st_epa=("epa", "mean"),
        st_plays=("epa", "count")
    )
    .reset_index()
    .sort_values("total_st_epa", ascending=False))
print("Special Teams EPA Rankings:")
print(team_st_epa)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2020:2023)
# Find blocked kicks
# NOTE(review): matching the substring "block" in the play description also
# catches unrelated phrases (e.g. "illegal block" penalties on returns) —
# this may overcount actual blocked kicks; verify against a blocked flag.
blocked_kicks <- pbp %>%
filter(
play_type %in% c("field_goal", "punt", "extra_point"),
str_detect(tolower(desc), "block")
) %>%
mutate(
blocked_type = play_type
)
# Blocked kicks by team (blocking team)
# pivot_wider yields one column per kick type; rowSums over the numeric
# columns gives the total before ranking.
team_blocks <- blocked_kicks %>%
group_by(defteam, blocked_type) %>%
summarize(blocks = n(), .groups = "drop") %>%
pivot_wider(names_from = blocked_type, values_from = blocks, values_fill = 0) %>%
mutate(total_blocks = rowSums(across(where(is.numeric)))) %>%
arrange(desc(total_blocks))
print("Teams with Most Blocked Kicks (2020-2023):")
print(team_blocks)
# Blocked kicks allowed
blocks_allowed <- blocked_kicks %>%
group_by(posteam, blocked_type) %>%
summarize(blocked = n(), .groups = "drop") %>%
pivot_wider(names_from = blocked_type, values_from = blocked, values_fill = 0) %>%
mutate(total_blocked = rowSums(across(where(is.numeric)))) %>%
arrange(desc(total_blocked))
print("\nTeams with Most Kicks Blocked Against:")
print(blocks_allowed)
import nfl_data_py as nfl
import pandas as pd
pbp = nfl.import_pbp_data([2020, 2021, 2022, 2023])
# Find blocked kicks
# NOTE(review): matching the substring "block" in the description may also
# catch unrelated phrases (e.g. "illegal block" penalties) — verify.
st_plays = pbp[pbp["play_type"].isin(["field_goal", "punt", "extra_point"])]
blocked = st_plays[st_plays["desc"].str.lower().str.contains("block", na=False)]
# Blocked kicks by blocking team
team_blocks = (blocked.groupby(["defteam", "play_type"])
.size()
.reset_index(name="blocks"))
# Pivot to get blocks by type
blocks_pivot = team_blocks.pivot_table(
index="defteam",
columns="play_type",
values="blocks",
fill_value=0
).reset_index()
blocks_pivot["total_blocks"] = blocks_pivot.select_dtypes(include="number").sum(axis=1)
blocks_pivot = blocks_pivot.sort_values("total_blocks", ascending=False)
# Unlike the R version, this prints only the top 10 and omits the
# blocks-allowed breakdown.
print("Teams with Most Blocked Kicks (2020-2023):")
print(blocks_pivot.head(10))
nflfastR
tidyverse
nfl_data_py
pandas
Build predictive models for spreads, totals, and player props
library(nflfastR)
library(tidyverse)
# Simple EPA-based power-rating model backtested against actual spreads.
# Load data
pbp <- load_pbp(2023)
schedules <- load_schedules(2023)
# Calculate team ratings
team_ratings <- pbp %>%
filter(!is.na(epa)) %>%
group_by(posteam) %>%
summarize(off_epa = mean(epa)) %>%
left_join(
pbp %>%
filter(!is.na(epa)) %>%
group_by(defteam) %>%
summarize(def_epa = mean(epa)),
by = c("posteam" = "defteam")
) %>%
mutate(
net_epa = off_epa - def_epa,
# 3.5 EPA-to-points multiplier is a heuristic, not a fitted coefficient.
power_rating = net_epa * 3.5 # Convert to points
)
# Create matchup predictions
games <- schedules %>%
filter(!is.na(result)) %>%
left_join(team_ratings, by = c("home_team" = "posteam")) %>%
rename(home_power = power_rating) %>%
left_join(team_ratings %>% select(posteam, power_rating),
by = c("away_team" = "posteam")) %>%
rename(away_power = power_rating) %>%
mutate(
# 2.5-point home-field advantage is likewise a fixed heuristic.
pred_spread = away_power - home_power - 2.5, # HFA
# result is presumably the home margin (home - away) per nflverse
# schedules convention, so -result is the away-perspective spread.
actual_spread = -result,
error = pred_spread - actual_spread,
# Pushes / zero-margin games (result == 0) count as incorrect here.
correct_side = (pred_spread > 0 & result < 0) |
(pred_spread < 0 & result > 0)
)
# Model performance
cat("Mean Absolute Error:", mean(abs(games$error)), "\n")
cat("Correct Side %:", mean(games$correct_side) * 100, "%\n")
import nfl_data_py as nfl
import pandas as pd
import numpy as np
# Simple EPA-based power-rating model backtested against actual spreads.
# Load data
pbp = nfl.import_pbp_data([2023])
schedules = nfl.import_schedules([2023])
# Calculate team ratings
plays = pbp[pbp["epa"].notna()]
off_epa = plays.groupby("posteam")["epa"].mean().reset_index()
off_epa.columns = ["team", "off_epa"]
def_epa = plays.groupby("defteam")["epa"].mean().reset_index()
def_epa.columns = ["team", "def_epa"]
team_ratings = off_epa.merge(def_epa, on="team")
team_ratings["net_epa"] = team_ratings["off_epa"] - team_ratings["def_epa"]
# 3.5 EPA-to-points multiplier is a heuristic, not a fitted coefficient.
team_ratings["power_rating"] = team_ratings["net_epa"] * 3.5
# Create matchup predictions
# NOTE: merge() defaults to an inner join, so games involving a team with no
# rating row would be silently dropped.
games = schedules[schedules["result"].notna()].copy()
games = games.merge(
team_ratings[["team", "power_rating"]],
left_on="home_team", right_on="team"
).rename(columns={"power_rating": "home_power"})
games = games.merge(
team_ratings[["team", "power_rating"]],
left_on="away_team", right_on="team"
).rename(columns={"power_rating": "away_power"})
# 2.5-point home-field advantage is a fixed heuristic; result is presumably
# the home margin (home - away) per nflverse schedules convention.
games["pred_spread"] = games["away_power"] - games["home_power"] - 2.5
games["actual_spread"] = -games["result"]
games["error"] = games["pred_spread"] - games["actual_spread"]
# Pushes / zero-margin games (result == 0) count as incorrect here.
games["correct_side"] = ((games["pred_spread"] > 0) & (games["result"] < 0)) | \
((games["pred_spread"] < 0) & (games["result"] > 0))
print(f"Mean Absolute Error: {games['error'].abs().mean():.2f}")
print(f"Correct Side %: {games['correct_side'].mean() * 100:.1f}%")
nflfastR
tidyverse
nfl_data_py
pandas
numpy
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
schedules <- load_schedules(2023)
# Calculate team scoring metrics.
# Points proxy: 7 per touchdown + 3 per made field goal (ignores XP misses,
# 2-point tries, and safeties).
team_scoring <- pbp %>%
  filter(!is.na(epa)) %>%
  group_by(game_id, posteam) %>%
  summarize(
    plays = n(),
    # Fixed: the original wrote `field_goal_result == "made" * 3`, which
    # parses as `field_goal_result == ("made" * 3)` and errors in R
    # ("non-numeric argument to binary operator"). Parenthesize the
    # comparison so each made FG contributes 3 points.
    points_scored = sum(touchdown * 7, na.rm = TRUE) +
      sum((field_goal_result == "made") * 3, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  group_by(posteam) %>%
  summarize(
    avg_plays = mean(plays),
    avg_points = mean(points_scored),
    pace = avg_plays / 60 # plays per minute proxy
  )
# Create totals predictions
games <- schedules %>%
  filter(!is.na(result), !is.na(total)) %>%
  left_join(team_scoring, by = c("home_team" = "posteam")) %>%
  rename(home_points = avg_points, home_pace = pace) %>%
  left_join(team_scoring %>% select(posteam, avg_points, pace),
    by = c("away_team" = "posteam")) %>%
  rename(away_points = avg_points, away_pace = pace) %>%
  mutate(
    # Combined scoring scaled by average pace — a rough heuristic model.
    pred_total = (home_points + away_points) *
      ((home_pace + away_pace) / 2),
    actual_total = home_score + away_score,
    over_under = if_else(actual_total > total, "Over", "Under"),
    pred_over = pred_total > total,
    actual_over = actual_total > total,
    correct = pred_over == actual_over
  )
# Model performance
cat("Correct %:", mean(games$correct) * 100, "%\n")
cat("MAE:", mean(abs(games$pred_total - games$actual_total)), "\n")
import nfl_data_py as nfl
import pandas as pd
import numpy as np
pbp = nfl.import_pbp_data([2023])
schedules = nfl.import_schedules([2023])
# Calculate team scoring metrics
plays = pbp[pbp["epa"].notna()]
game_scoring = (plays.groupby(["game_id", "posteam"])
.agg(
plays=("epa", "count"),
touchdowns=("touchdown", "sum")
)
.reset_index())
# NOTE(review): points proxy counts only touchdowns (7 each) — field goals
# are ignored here, unlike the companion R snippet — so pred_total runs low.
game_scoring["points_approx"] = game_scoring["touchdowns"] * 7
team_scoring = (game_scoring.groupby("posteam")
.agg(
avg_plays=("plays", "mean"),
avg_points=("points_approx", "mean")
)
.reset_index())
# Pace = plays per minute proxy (60-minute game); used to scale the total.
team_scoring["pace"] = team_scoring["avg_plays"] / 60
# Create totals predictions
games = schedules[schedules["result"].notna() & schedules["total"].notna()].copy()
games = games.merge(
team_scoring[["posteam", "avg_points", "pace"]],
left_on="home_team", right_on="posteam"
).rename(columns={"avg_points": "home_points", "pace": "home_pace"})
games = games.merge(
team_scoring[["posteam", "avg_points", "pace"]],
left_on="away_team", right_on="posteam"
).rename(columns={"avg_points": "away_points", "pace": "away_pace"})
# Combined scoring scaled by average pace — a rough heuristic model.
games["pred_total"] = (games["home_points"] + games["away_points"]) * \
((games["home_pace"] + games["away_pace"]) / 2)
games["actual_total"] = games["home_score"] + games["away_score"]
games["correct"] = (games["pred_total"] > games["total"]) == \
(games["actual_total"] > games["total"])
print(f"Correct %: {games['correct'].mean() * 100:.1f}%")
print(f"MAE: {(games['pred_total'] - games['actual_total']).abs().mean():.1f}")
nflfastR
tidyverse
nfl_data_py
pandas
numpy
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Calculate receiver baseline stats
# Baseline per-receiver rates for a simple receiving-yards projection.
receiver_stats <- pbp %>%
filter(!is.na(receiver_player_id), play_type == "pass") %>%
group_by(receiver_player_id, receiver_player_name, posteam) %>%
summarize(
targets = n(),
receptions = sum(complete_pass),
yards = sum(yards_gained, na.rm = TRUE),
tds = sum(touchdown),
avg_depth = mean(air_yards, na.rm = TRUE),
.groups = "drop"
) %>%
filter(targets >= 50) %>%
mutate(
catch_rate = receptions / targets,
yards_per_target = yards / targets,
# NOTE(review): unguarded division — a 50+ target player with zero
# receptions would yield Inf here (implausible, but not impossible).
yards_per_reception = yards / receptions
)
# Calculate team target rates for each receiver
team_targets <- pbp %>%
filter(play_type == "pass", !is.na(receiver_player_id)) %>%
group_by(posteam) %>%
summarize(team_targets = n())
receiver_share <- receiver_stats %>%
left_join(team_targets, by = "posteam") %>%
mutate(target_share = targets / team_targets)
# Project yards for next game
# Assume team throws ~35 passes per game
receiver_share <- receiver_share %>%
mutate(
proj_targets = target_share * 35,
proj_yards = proj_targets * yards_per_target
) %>%
arrange(desc(proj_yards))
print(receiver_share %>%
select(receiver_player_name, target_share,
yards_per_target, proj_yards) %>%
head(20))
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])
# Receiver baseline stats for a simple target-share yardage projection.
pass_plays = pbp[(pbp["receiver_player_id"].notna()) &
                 (pbp["play_type"] == "pass")]
receiver_stats = (pass_plays.groupby(
        ["receiver_player_id", "receiver_player_name", "posteam"])
    .agg(
        targets=("play_id", "count"),
        receptions=("complete_pass", "sum"),
        yards=("yards_gained", "sum"),
        tds=("touchdown", "sum"),
        avg_depth=("air_yards", "mean")
    )
    .reset_index())
# .copy() so the derived-rate assignments below write to an owned frame (the
# original assigned into a filtered slice, which raises SettingWithCopyWarning
# and is a no-op under copy-on-write).
receiver_stats = receiver_stats[receiver_stats["targets"] >= 50].copy()
receiver_stats["catch_rate"] = receiver_stats["receptions"] / receiver_stats["targets"]
receiver_stats["yards_per_target"] = receiver_stats["yards"] / receiver_stats["targets"]
# Calculate team target rates -> each receiver's share of his team's targets
team_targets = (pass_plays.groupby("posteam")
    .size().reset_index(name="team_targets"))
receiver_share = receiver_stats.merge(team_targets, on="posteam")
receiver_share["target_share"] = receiver_share["targets"] / receiver_share["team_targets"]
# Project yards (35 passes per game assumption)
receiver_share["proj_targets"] = receiver_share["target_share"] * 35
receiver_share["proj_yards"] = receiver_share["proj_targets"] * receiver_share["yards_per_target"]
result = receiver_share.nlargest(20, "proj_yards")[
    ["receiver_player_name", "target_share", "yards_per_target", "proj_yards"]
]
print(result)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
schedules <- load_schedules(2020:2023)
# Initialize Elo ratings
# NOTE(review): assumes exactly 32 unique team codes in 2020-2023 schedules;
# setNames() errors if the unique() vector has a different length than 32.
elo <- setNames(rep(1500, 32),
unique(c(schedules$home_team, schedules$away_team)))
k_factor <- 20
# Calculate expected score
expected <- function(r1, r2) 1 / (1 + 10^((r2 - r1)/400))
# Elo calculation function
# Margin-of-victory multiplier dampens blowout updates and shrinks further
# when the winner was already rated higher (presumably FiveThirtyEight-style).
update_elo <- function(winner_rating, loser_rating, margin) {
exp_win <- expected(winner_rating, loser_rating)
mov_mult <- log(abs(margin) + 1) * 2.2 / ((winner_rating - loser_rating) * 0.001 + 2.2)
change <- k_factor * mov_mult * (1 - exp_win)
return(change)
}
# Process games
# Ratings carry across seasons with no offseason regression to the mean.
games <- schedules %>%
filter(!is.na(result)) %>%
arrange(game_id)
# NOTE(review): elo_history is created but never populated below.
elo_history <- list()
for (i in seq_len(nrow(games))) {
g <- games[i,]
# The +55 HFA only adjusts the expectation for this game; it is not stored.
home_elo <- elo[g$home_team] + 55 # HFA
away_elo <- elo[g$away_team]
if (g$result > 0) { # Home win
change <- update_elo(home_elo, away_elo, g$result)
elo[g$home_team] <- elo[g$home_team] + change
elo[g$away_team] <- elo[g$away_team] - change
} else { # Away win
# Ties (result == 0) fall into this branch, but margin 0 gives
# log(0 + 1) = 0, so the rating change is zero.
change <- update_elo(away_elo, home_elo, -g$result)
elo[g$away_team] <- elo[g$away_team] + change
elo[g$home_team] <- elo[g$home_team] - change
}
}
# Current Elo rankings
elo_df <- data.frame(team = names(elo), elo = elo) %>%
arrange(desc(elo))
print(elo_df)
import nfl_data_py as nfl
import pandas as pd
import numpy as np
schedules = nfl.import_schedules([2020, 2021, 2022, 2023])
# Initialize Elo ratings (one entry per team code seen in the schedules;
# ratings carry across seasons with no offseason regression to the mean)
teams = pd.concat([schedules["home_team"], schedules["away_team"]]).unique()
elo = {team: 1500 for team in teams}
k_factor = 20
def expected(r1, r2):
# Standard Elo win expectancy with a 400-point scale.
return 1 / (1 + 10**((r2 - r1)/400))
def update_elo(winner_rating, loser_rating, margin):
# Margin-of-victory multiplier dampens blowout updates and shrinks further
# when the winner was already rated higher (presumably 538-style).
exp_win = expected(winner_rating, loser_rating)
mov_mult = np.log(abs(margin) + 1) * 2.2 / ((winner_rating - loser_rating) * 0.001 + 2.2)
change = k_factor * mov_mult * (1 - exp_win)
return change
# Process games
games = schedules[schedules["result"].notna()].sort_values("game_id")
for _, g in games.iterrows():
# The +55 HFA only adjusts this game's expectation; it is not stored.
home_elo = elo[g["home_team"]] + 55 # Home field advantage
away_elo = elo[g["away_team"]]
if g["result"] > 0: # Home win
change = update_elo(home_elo, away_elo, g["result"])
elo[g["home_team"]] += change
elo[g["away_team"]] -= change
else: # Away win
# Ties (result == 0) fall here, but margin 0 gives log(1) = 0,
# so the rating change is zero.
change = update_elo(away_elo, home_elo, -g["result"])
elo[g["away_team"]] += change
elo[g["home_team"]] -= change
# Current Elo rankings
elo_df = pd.DataFrame({"team": list(elo.keys()), "elo": list(elo.values())})
elo_df = elo_df.sort_values("elo", ascending=False).reset_index(drop=True)
print("Current Elo Rankings:")
print(elo_df)
nflfastR
tidyverse
nfl_data_py
pandas
numpy
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
schedules <- load_schedules(2023)
# Calculate multiple components
# Composite power ranking from offensive EPA, defensive EPA, and record.
team_metrics <- pbp %>%
filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
group_by(posteam) %>%
summarize(
off_epa = mean(epa),
success_rate = mean(success) * 100,
.groups = "drop"
) %>%
left_join(
pbp %>%
filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
group_by(defteam) %>%
summarize(
def_epa = mean(epa),
def_success_allowed = mean(success) * 100
),
by = c("posteam" = "defteam")
)
# Get win-loss record
# Each game contributes two rows (home/away perspective) after pivot_longer.
# NOTE(review): ties (result == 0) count as non-wins for both teams.
records <- schedules %>%
filter(!is.na(result)) %>%
pivot_longer(c(home_team, away_team), names_to = "location", values_to = "team") %>%
mutate(
win = (location == "home_team" & result > 0) | (location == "away_team" & result < 0)
) %>%
group_by(team) %>%
summarize(wins = sum(win), games = n(), win_pct = wins/games)
# Combine into power ranking
power_rankings <- team_metrics %>%
left_join(records, by = c("posteam" = "team")) %>%
mutate(
# Normalize each metric to 0-100
# Min-max over the league; def_score is inverted (less EPA allowed = better).
off_score = (off_epa - min(off_epa)) / (max(off_epa) - min(off_epa)) * 100,
def_score = 100 - (def_epa - min(def_epa)) / (max(def_epa) - min(def_epa)) * 100,
win_score = win_pct * 100,
# Composite (40% offense, 40% defense, 20% record)
power_rating = off_score * 0.4 + def_score * 0.4 + win_score * 0.2
) %>%
arrange(desc(power_rating)) %>%
mutate(rank = row_number())
print(power_rankings %>% select(posteam, rank, power_rating, off_epa, def_epa, win_pct))
import nfl_data_py as nfl
import pandas as pd
import numpy as np
pbp = nfl.import_pbp_data([2023])
schedules = nfl.import_schedules([2023])
# Calculate team metrics
# Composite power ranking from offensive EPA, defensive EPA, and record.
plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"].isin(["pass", "run"]))]
off_stats = plays.groupby("posteam").agg(
off_epa=("epa", "mean"),
success_rate=("success", lambda x: x.mean() * 100)
).reset_index()
def_stats = plays.groupby("defteam").agg(
def_epa=("epa", "mean")
).reset_index()
team_metrics = off_stats.merge(def_stats, left_on="posteam", right_on="defteam")
# Get win-loss records
# NOTE(review): the Series addition below aligns on team name; a team that
# never appears as home (or away) would produce NaN. Ties count as non-wins.
games = schedules[schedules["result"].notna()]
home_wins = games[games["result"] > 0].groupby("home_team").size()
away_wins = games[games["result"] < 0].groupby("away_team").size()
total_games = games.groupby("home_team").size() + games.groupby("away_team").size()
total_wins = home_wins.add(away_wins, fill_value=0)
win_pct = (total_wins / total_games).reset_index()
win_pct.columns = ["team", "win_pct"]
# Combine
power_rankings = team_metrics.merge(win_pct, left_on="posteam", right_on="team")
# Normalize
# Min-max to 0-100 over the league; def_score inverted (less allowed = better).
def normalize(col):
return (col - col.min()) / (col.max() - col.min()) * 100
power_rankings["off_score"] = normalize(power_rankings["off_epa"])
power_rankings["def_score"] = 100 - normalize(power_rankings["def_epa"])
power_rankings["win_score"] = power_rankings["win_pct"] * 100
# Composite weights: 40% offense, 40% defense, 20% record.
power_rankings["power_rating"] = (power_rankings["off_score"] * 0.4 +
power_rankings["def_score"] * 0.4 +
power_rankings["win_score"] * 0.2)
power_rankings = power_rankings.sort_values("power_rating", ascending=False).reset_index(drop=True)
power_rankings["rank"] = range(1, len(power_rankings) + 1)
print(power_rankings[["posteam", "rank", "power_rating", "off_epa", "def_epa", "win_pct"]])
nflfastR
tidyverse
nfl_data_py
pandas
numpy
library(nflfastR)
library(tidyverse)
schedules <- load_schedules(2019:2023)
# Overall HFA
# result > 0 is treated as a home win throughout (presumably result is the
# home margin per nflverse schedules convention); ties are broken out once
# here but otherwise count as non-wins.
hfa_overall <- schedules %>%
filter(!is.na(result)) %>%
summarize(
games = n(),
home_wins = sum(result > 0),
ties = sum(result == 0),
away_wins = sum(result < 0),
home_win_pct = mean(result > 0) * 100,
avg_home_margin = mean(result)
)
print("Overall Home Field Advantage:")
print(hfa_overall)
# HFA by team
team_hfa <- schedules %>%
filter(!is.na(result)) %>%
group_by(home_team) %>%
summarize(
home_games = n(),
home_wins = sum(result > 0),
home_win_pct = mean(result > 0) * 100,
avg_margin = mean(result),
.groups = "drop"
) %>%
arrange(desc(home_win_pct))
print("\nHFA by Stadium:")
print(team_hfa)
# HFA by season (trend over time)
hfa_by_season <- schedules %>%
filter(!is.na(result)) %>%
group_by(season) %>%
summarize(
home_win_pct = mean(result > 0) * 100,
avg_margin = mean(result)
)
print("\nHFA Trend by Season:")
print(hfa_by_season)
import nfl_data_py as nfl
import pandas as pd
schedules = nfl.import_schedules([2019, 2020, 2021, 2022, 2023])
# Overall HFA
# result > 0 is treated as a home win throughout (presumably result is the
# home margin per nflverse schedules convention); ties count as non-wins.
games = schedules[schedules["result"].notna()]
hfa_overall = {
"games": len(games),
"home_wins": (games["result"] > 0).sum(),
"away_wins": (games["result"] < 0).sum(),
"home_win_pct": (games["result"] > 0).mean() * 100,
"avg_home_margin": games["result"].mean()
}
print("Overall Home Field Advantage:")
print(pd.DataFrame([hfa_overall]))
# HFA by team
team_hfa = (games.groupby("home_team")
.agg(
home_games=("result", "count"),
home_wins=("result", lambda x: (x > 0).sum()),
home_win_pct=("result", lambda x: (x > 0).mean() * 100),
avg_margin=("result", "mean")
)
.reset_index()
.sort_values("home_win_pct", ascending=False))
print("\nHFA by Stadium:")
print(team_hfa)
# HFA by season
hfa_by_season = (games.groupby("season")
.agg(
home_win_pct=("result", lambda x: (x > 0).mean() * 100),
avg_margin=("result", "mean")
)
.reset_index())
print("\nHFA Trend by Season:")
print(hfa_by_season)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
schedules <- load_schedules(2018:2023)
# Calculate margin distribution
# Distribution of absolute final margins ("key numbers" for spread betting).
margins <- schedules %>%
filter(!is.na(result)) %>%
mutate(margin = abs(result)) %>%
group_by(margin) %>%
summarize(games = n()) %>%
mutate(pct = games / sum(games) * 100) %>%
arrange(desc(games))
# Top key numbers
# cumulative_pct runs over margins sorted by frequency (most common first),
# not by margin value.
key_numbers <- margins %>%
head(15) %>%
mutate(cumulative_pct = cumsum(pct))
print("Top 15 Final Margins:")
print(key_numbers)
# Games landing on 3 and 7
on_3_or_7 <- schedules %>%
filter(!is.na(result)) %>%
mutate(
on_3 = abs(result) == 3,
on_7 = abs(result) == 7,
on_key = abs(result) %in% c(3, 7, 10, 14)
) %>%
summarize(
total_games = n(),
on_3 = sum(on_3),
on_7 = sum(on_7),
on_3_pct = mean(on_3) * 100,
on_7_pct = mean(on_7) * 100,
any_key_pct = mean(on_key) * 100
)
print("\nKey Number Summary:")
print(on_3_or_7)
import nfl_data_py as nfl
import pandas as pd
schedules = nfl.import_schedules([2018, 2019, 2020, 2021, 2022, 2023])
# Calculate margin distribution
# Distribution of absolute final margins ("key numbers" for spread betting).
games = schedules[schedules["result"].notna()].copy()
games["margin"] = games["result"].abs()
margins = (games.groupby("margin")
.size()
.reset_index(name="games")
.sort_values("games", ascending=False))
margins["pct"] = margins["games"] / margins["games"].sum() * 100
# cumulative_pct runs over margins sorted by frequency, not by margin value.
margins["cumulative_pct"] = margins["pct"].cumsum()
print("Top 15 Final Margins:")
print(margins.head(15))
# Key number analysis
games["on_3"] = games["margin"] == 3
games["on_7"] = games["margin"] == 7
games["on_key"] = games["margin"].isin([3, 7, 10, 14])
key_summary = {
"total_games": len(games),
"on_3": games["on_3"].sum(),
"on_7": games["on_7"].sum(),
"on_3_pct": games["on_3"].mean() * 100,
"on_7_pct": games["on_7"].mean() * 100,
"any_key_pct": games["on_key"].mean() * 100
}
print("\nKey Number Summary:")
print(pd.DataFrame([key_summary]))
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
schedules <- load_schedules(2023)
# Against-the-spread (ATS) results.
# nflverse convention: `result` = home margin (home - away), and
# `spread_line` is POSITIVE when the HOME team is favored by that many points.
spread_analysis <- schedules %>%
  filter(!is.na(result), !is.na(spread_line)) %>%
  mutate(
    # Home covers when the actual margin beats the line.
    # Fixed: the original used `result + spread_line > 0` and treated a
    # negative spread_line as a home favorite — both invert the nflverse
    # sign convention above.
    home_covered = result - spread_line > 0,
    favorite = if_else(spread_line > 0, "Home", "Away"),
    spread_bucket = case_when(
      abs(spread_line) <= 3 ~ "Pick/FG",
      abs(spread_line) <= 7 ~ "4-7 pts",
      abs(spread_line) <= 10 ~ "8-10 pts",
      TRUE ~ "10+ pts"
    )
  )
# ATS performance by spread size
ats_by_spread <- spread_analysis %>%
  group_by(spread_bucket) %>%
  summarize(
    games = n(),
    home_cover_pct = mean(home_covered) * 100,
    avg_margin = mean(result - spread_line), # average home cover margin
    .groups = "drop"
  )
print("ATS Performance by Spread Size:")
print(ats_by_spread)
# Favorite vs underdog ATS (pushes land on the underdog side here)
fav_vs_dog <- spread_analysis %>%
  mutate(
    favorite_covered = (favorite == "Home" & home_covered) |
      (favorite == "Away" & !home_covered)
  ) %>%
  summarize(
    games = n(),
    favorite_cover_pct = mean(favorite_covered) * 100,
    dog_cover_pct = (1 - mean(favorite_covered)) * 100
  )
print("\nFavorite vs Underdog ATS:")
print(fav_vs_dog)
import nfl_data_py as nfl
import pandas as pd
import numpy as np
schedules = nfl.import_schedules([2023])
# Analyze spread patterns
games = schedules[(schedules["result"].notna()) & (schedules["spread_line"].notna())].copy()
games["home_covered"] = games["result"] + games["spread_line"] > 0
games["favorite"] = np.where(games["spread_line"] < 0, "Home", "Away")
def spread_bucket(s):
    """Bucket a point spread by the absolute size of the line."""
    magnitude = abs(s)
    for limit, label in ((3, "Pick/FG"), (7, "4-7 pts"), (10, "8-10 pts")):
        if magnitude <= limit:
            return label
    return "10+ pts"
# Bucket each game by the size of the closing spread.
games["spread_bucket"] = games["spread_line"].apply(spread_bucket)

# ATS performance by spread size.
ats_by_spread = (games.groupby("spread_bucket")
    .agg(
        games=("home_covered", "count"),
        home_cover_pct=("home_covered", lambda x: x.mean() * 100),
        # ATS margin for the home side is result MINUS the home handicap
        # (was `x + spread_line`, which flips the sign under nflverse convention).
        avg_margin=("result", lambda x: (x - games.loc[x.index, "spread_line"]).mean())
    )
    .reset_index())
print("ATS Performance by Spread Size:")
print(ats_by_spread)

# Favorite vs underdog ATS record.
games["favorite_covered"] = ((games["favorite"] == "Home") & games["home_covered"]) | \
    ((games["favorite"] == "Away") & ~games["home_covered"])
fav_cover_pct = games["favorite_covered"].mean() * 100
print(f"\nFavorite cover %: {fav_cover_pct:.1f}%")
print(f"Underdog cover %: {100 - fav_cover_pct:.1f}%")
nflfastR
tidyverse
nfl_data_py
pandas
numpy
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)
schedules <- load_schedules(2023)

# Score at the end of the first half.
# BUG FIX: pbp `home_score`/`away_score` are the FINAL game scores repeated on
# every row, so the old code returned full-game scores and made 2H margins ~0.
# The running score lives in `total_home_score`/`total_away_score`.
# NOTE(review): docs describe these as the score around the play; a score on
# the very last snap of the half may need `posteam_score_post` — confirm.
first_half <- pbp %>%
  filter(qtr <= 2) %>%
  group_by(game_id) %>%
  summarize(
    home_1h = total_home_score[which.max(play_id)],
    away_1h = total_away_score[which.max(play_id)],
    .groups = "drop"
  )

# Final scores from the schedule (result = home_score - away_score)
final_scores <- schedules %>%
  filter(!is.na(result)) %>%
  select(game_id, home_score, away_score, spread_line, result)

# Combine and derive per-half margins
game_halves <- final_scores %>%
  left_join(first_half, by = "game_id") %>%
  mutate(
    home_2h = home_score - home_1h,
    away_2h = away_score - away_1h,
    result_1h = home_1h - away_1h,
    result_2h = home_2h - away_2h,
    # Compare halves
    home_stronger_2h = result_2h > result_1h,
    halves_same_winner = (result_1h > 0) == (result > 0)
  )

# Summary across all games (na.rm guards games missing pbp after the join)
half_analysis <- game_halves %>%
  summarize(
    games = n(),
    avg_1h_margin = mean(result_1h, na.rm = TRUE),
    avg_2h_margin = mean(result_2h, na.rm = TRUE),
    same_winner_pct = mean(halves_same_winner, na.rm = TRUE) * 100,
    home_stronger_2h_pct = mean(home_stronger_2h, na.rm = TRUE) * 100
  )
print("First Half vs Second Half Analysis:")
print(half_analysis)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])
schedules = nfl.import_schedules([2023])

# Score at the end of the first half.
# BUG FIX: pbp `home_score`/`away_score` are the FINAL game scores repeated on
# every row, so `("home_score", "last")` returned full-game scores and made the
# 2H columns ~0.  The running score is `total_home_score`/`total_away_score`.
# NOTE(review): a score on the final snap of the half may need the *_post
# score columns — confirm against the nflfastR field dictionary.
first_half = (pbp[pbp["qtr"] <= 2]
    .sort_values("play_id")  # make "last" mean the last play of the half
    .groupby("game_id")
    .agg(
        home_1h=("total_home_score", "last"),
        away_1h=("total_away_score", "last")
    )
    .reset_index())

# Final scores (result = home_score - away_score)
final_scores = schedules[schedules["result"].notna()][
    ["game_id", "home_score", "away_score", "spread_line", "result"]
]

# Combine and derive per-half margins
game_halves = final_scores.merge(first_half, on="game_id")
game_halves["home_2h"] = game_halves["home_score"] - game_halves["home_1h"]
game_halves["away_2h"] = game_halves["away_score"] - game_halves["away_1h"]
game_halves["result_1h"] = game_halves["home_1h"] - game_halves["away_1h"]
game_halves["result_2h"] = game_halves["home_2h"] - game_halves["away_2h"]
game_halves["same_winner"] = (game_halves["result_1h"] > 0) == (game_halves["result"] > 0)
game_halves["home_stronger_2h"] = game_halves["result_2h"] > game_halves["result_1h"]

# Summary across all games
half_analysis = {
    "games": len(game_halves),
    "avg_1h_margin": game_halves["result_1h"].mean(),
    "avg_2h_margin": game_halves["result_2h"].mean(),
    "same_winner_pct": game_halves["same_winner"].mean() * 100,
    "home_stronger_2h_pct": game_halves["home_stronger_2h"].mean() * 100
}
print("First Half vs Second Half Analysis:")
print(pd.DataFrame([half_analysis]))
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

schedules <- load_schedules(2019:2023)

# Team -> division membership
divisions <- list(
  AFC_East = c("BUF", "MIA", "NE", "NYJ"),
  AFC_North = c("BAL", "CIN", "CLE", "PIT"),
  AFC_South = c("HOU", "IND", "JAX", "TEN"),
  AFC_West = c("DEN", "KC", "LV", "LAC"),
  NFC_East = c("DAL", "NYG", "PHI", "WAS"),
  NFC_North = c("CHI", "DET", "GB", "MIN"),
  NFC_South = c("ATL", "CAR", "NO", "TB"),
  NFC_West = c("ARI", "LAR", "SF", "SEA")
)

# Return the division a team belongs to, or NA if unknown.
get_division <- function(team) {
  for (div in names(divisions)) {
    if (team %in% divisions[[div]]) return(div)
  }
  return(NA)
}

# Tag each completed game with both teams' divisions
div_games <- schedules %>%
  filter(!is.na(result), !is.na(spread_line)) %>%
  rowwise() %>%
  mutate(
    home_div = get_division(home_team),
    away_div = get_division(away_team),
    is_division_game = home_div == away_div
  ) %>%
  ungroup()

# Compare division vs non-division games.
# nflverse: result = home margin; spread_line is POSITIVE when the home team
# is favored, so the home side covers when result > spread_line
# (the old `result + spread_line > 0` inverted this).
div_comparison <- div_games %>%
  group_by(is_division_game) %>%
  summarize(
    games = n(),
    home_win_pct = mean(result > 0) * 100,
    home_cover_pct = mean(result > spread_line) * 100,
    avg_total = mean(home_score + away_score),
    avg_margin = mean(abs(result)),
    .groups = "drop"
  )
print("Division vs Non-Division Game Analysis:")
print(div_comparison)
import nfl_data_py as nfl
import pandas as pd
# Five seasons of schedules/results (2019-2023) for a larger ATS sample.
schedules = nfl.import_schedules([2019, 2020, 2021, 2022, 2023])
# Define divisions
# Team abbreviation -> division membership
divisions = {
    "AFC_East": ["BUF", "MIA", "NE", "NYJ"],
    "AFC_North": ["BAL", "CIN", "CLE", "PIT"],
    "AFC_South": ["HOU", "IND", "JAX", "TEN"],
    "AFC_West": ["DEN", "KC", "LV", "LAC"],
    "NFC_East": ["DAL", "NYG", "PHI", "WAS"],
    "NFC_North": ["CHI", "DET", "GB", "MIN"],
    "NFC_South": ["ATL", "CAR", "NO", "TB"],
    "NFC_West": ["ARI", "LAR", "SF", "SEA"]
}

def get_division(team):
    """Return the division name containing `team`, or None if unknown."""
    hits = [name for name, members in divisions.items() if team in members]
    return hits[0] if hits else None
# Keep completed games with a spread and tag each side's division.
games = schedules[(schedules["result"].notna()) & (schedules["spread_line"].notna())].copy()
games["home_div"] = games["home_team"].apply(get_division)
games["away_div"] = games["away_team"].apply(get_division)
games["is_division_game"] = games["home_div"] == games["away_div"]

# Compare division vs non-division games.
# nflverse: result = home margin; spread_line is POSITIVE when the home team
# is favored, so the home side covers when result > spread_line
# (the old `x + spread_line > 0` inverted this).
div_comparison = (games.groupby("is_division_game")
    .agg(
        games=("result", "count"),
        home_win_pct=("result", lambda x: (x > 0).mean() * 100),
        home_cover_pct=("result", lambda x: ((x - games.loc[x.index, "spread_line"]) > 0).mean() * 100),
        avg_total=("home_score", lambda x: (x + games.loc[x.index, "away_score"]).mean()),
        avg_margin=("result", lambda x: x.abs().mean())
    )
    .reset_index())
div_comparison["is_division_game"] = div_comparison["is_division_game"].map({True: "Division", False: "Non-Division"})
print("Division vs Non-Division Game Analysis:")
print(div_comparison)
nflfastR
tidyverse
nfl_data_py
pandas
Fantasy football analysis, projections, and trade evaluations
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# PPR scoring:
#   Passing:   0.04/yd, 4/TD, -2/INT
#   Rushing:   0.1/yd, 6/TD
#   Receiving: 0.1/yd, 6/TD, 1/reception

# QB fantasy points
qb_fantasy <- pbp %>%
  filter(!is.na(passer_player_id)) %>%
  group_by(passer_player_id, passer_player_name) %>%
  summarize(
    pass_yards = sum(passing_yards, na.rm = TRUE),
    pass_tds = sum(pass_touchdown, na.rm = TRUE),
    # na.rm added for consistency with the other aggregates
    ints = sum(interception, na.rm = TRUE),
    fantasy_pts = pass_yards * 0.04 + pass_tds * 4 - ints * 2,
    .groups = "drop"
  ) %>%
  arrange(desc(fantasy_pts))

# RB/WR/TE fantasy points (PPR)
skill_fantasy <- pbp %>%
  filter(!is.na(rusher_player_id) | !is.na(receiver_player_id)) %>%
  mutate(
    player_id = coalesce(rusher_player_id, receiver_player_id),
    player_name = coalesce(rusher_player_name, receiver_player_name)
  ) %>%
  group_by(player_id, player_name) %>%
  summarize(
    rush_yards = sum(rushing_yards, na.rm = TRUE),
    rush_tds = sum(rush_touchdown, na.rm = TRUE),
    receptions = sum(complete_pass, na.rm = TRUE),
    rec_yards = sum(receiving_yards, na.rm = TRUE),
    # Only count passing TDs on plays where this player was the receiver
    rec_tds = sum(pass_touchdown[!is.na(receiver_player_id)], na.rm = TRUE),
    fantasy_pts = rush_yards * 0.1 + rush_tds * 6 +
      receptions * 1 + rec_yards * 0.1 + rec_tds * 6,
    .groups = "drop"
  ) %>%
  filter(fantasy_pts > 50) %>%
  arrange(desc(fantasy_pts))
print(skill_fantasy %>% head(30))
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# QB fantasy points: 0.04/pass yd, 4/pass TD, -2/INT
qb_fantasy = (
    pbp[pbp["passer_player_id"].notna()]
    .groupby(["passer_player_id", "passer_player_name"])
    .agg(
        pass_yards=("passing_yards", "sum"),
        pass_tds=("pass_touchdown", "sum"),
        ints=("interception", "sum"),
    )
    .reset_index()
)
qb_fantasy["fantasy_pts"] = (
    qb_fantasy["pass_yards"] * 0.04
    + qb_fantasy["pass_tds"] * 4
    - qb_fantasy["ints"] * 2
)
qb_fantasy = qb_fantasy.sort_values("fantasy_pts", ascending=False)

# Skill-position PPR points: per-player rushing and receiving totals,
# renamed to a common (player_id, player_name) key for merging.
rush_pts = (
    pbp[pbp["rusher_player_id"].notna()]
    .groupby(["rusher_player_id", "rusher_player_name"])
    .agg(rush_yards=("rushing_yards", "sum"), rush_tds=("rush_touchdown", "sum"))
    .reset_index()
    .rename(columns={"rusher_player_id": "player_id",
                     "rusher_player_name": "player_name"})
)
rec_pts = (
    pbp[pbp["receiver_player_id"].notna()]
    .groupby(["receiver_player_id", "receiver_player_name"])
    .agg(
        receptions=("complete_pass", "sum"),
        rec_yards=("receiving_yards", "sum"),
        rec_tds=("pass_touchdown", "sum"),
    )
    .reset_index()
    .rename(columns={"receiver_player_id": "player_id",
                     "receiver_player_name": "player_name"})
)

# Combine rushing and receiving production; players missing one side get 0s.
skill_fantasy = rush_pts.merge(rec_pts, on=["player_id", "player_name"], how="outer").fillna(0)
skill_fantasy["fantasy_pts"] = (
    skill_fantasy["rush_yards"] * 0.1
    + skill_fantasy["rush_tds"] * 6
    + skill_fantasy["receptions"] * 1
    + skill_fantasy["rec_yards"] * 0.1
    + skill_fantasy["rec_tds"] * 6
)
skill_fantasy = skill_fantasy[skill_fantasy["fantasy_pts"] > 50].sort_values("fantasy_pts", ascending=False)
print(skill_fantasy.head(30))
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Target share, air-yard share and WOPR, all computed WITHIN each team.
target_share <- pbp %>%
  filter(play_type == "pass", !is.na(receiver_player_id)) %>%
  group_by(posteam) %>%
  mutate(
    team_targets = n(),
    team_air_yards = sum(air_yards, na.rm = TRUE)
  ) %>%
  group_by(posteam, receiver_player_id, receiver_player_name,
           team_targets, team_air_yards) %>%
  summarize(
    targets = n(),
    air_yards = sum(air_yards, na.rm = TRUE),
    receptions = sum(complete_pass),
    yards = sum(yards_gained, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    target_share = targets / team_targets,
    # BUG FIX: was `air_yards / sum(air_yards)` AFTER ungrouping, which
    # divided by the league-wide total; a share must be of the player's team.
    air_yard_share = air_yards / team_air_yards,
    wopr = target_share * 1.5 + air_yard_share * 0.7 # Weighted Opportunity Rating
  ) %>%
  arrange(desc(wopr))

# Top target share players
print(target_share %>%
  select(receiver_player_name, posteam, target_share,
         air_yard_share, wopr) %>%
  head(25))
import nfl_data_py as nfl
import pandas as pd
pbp = nfl.import_pbp_data([2023])
# Filter to pass plays with receiver
pass_plays = pbp[(pbp["play_type"] == "pass") &
                 (pbp["receiver_player_id"].notna())]
# Team totals: denominators for per-player shares
team_targets = pass_plays.groupby("posteam").agg(
    team_targets=("play_id", "count"),
    team_air_yards=("air_yards", "sum")
).reset_index()
# Player stats per (team, receiver)
player_stats = (pass_plays.groupby(
    ["posteam", "receiver_player_id", "receiver_player_name"])
    .agg(
        targets=("play_id", "count"),
        air_yards=("air_yards", "sum"),
        receptions=("complete_pass", "sum"),
        yards=("yards_gained", "sum")
    )
    .reset_index())
# Calculate shares relative to the player's own team
target_share = player_stats.merge(team_targets, on="posteam")
target_share["target_share"] = target_share["targets"] / target_share["team_targets"]
target_share["air_yard_share"] = target_share["air_yards"] / target_share["team_air_yards"]
# WOPR = 1.5 * target share + 0.7 * air-yard share
target_share["wopr"] = target_share["target_share"] * 1.5 + target_share["air_yard_share"] * 0.7
result = target_share.nlargest(25, "wopr")[
    ["receiver_player_name", "posteam", "target_share", "air_yard_share", "wopr"]
]
print(result)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

# Load participation data
participation <- load_participation(2023)
rosters <- fast_scraper_roster(2023)

# `offense_players` is a ";"-separated list of GSIS ids on the field, so after
# separate_rows each remaining row is one player-snap.
snap_counts <- participation %>%
  separate_rows(offense_players, sep = ";") %>%
  filter(offense_players != "") %>%
  group_by(nflverse_game_id, offense_players) %>%
  summarize(snaps = n(), .groups = "drop") %>%
  rename(gsis_id = offense_players)

# Attach names/positions.  distinct() guards against duplicate roster rows per
# gsis_id, which would fan out the join and inflate snap totals.
snap_counts <- snap_counts %>%
  left_join(
    rosters %>%
      select(gsis_id, full_name, position, team) %>%
      distinct(gsis_id, .keep_all = TRUE),
    by = "gsis_id"
  )

# Season totals per player
season_snaps <- snap_counts %>%
  group_by(gsis_id, full_name, position, team) %>%
  summarize(
    games = n(),
    total_snaps = sum(snaps),
    avg_snaps = mean(snaps),
    .groups = "drop"
  ) %>%
  filter(position %in% c("RB", "WR", "TE")) %>%
  arrange(desc(total_snaps))
print(season_snaps %>% head(30))
import nfl_data_py as nfl
import pandas as pd
# Load weekly snap count data (one row per player per game)
participation = nfl.import_snap_counts([2023])
rosters = nfl.import_rosters([2023])
# Aggregate per player across the season
season_snaps = (participation.groupby("pfr_player_id")
    .agg(
        games=("week", "nunique"),
        total_off_snaps=("offense_snaps", "sum"),
        avg_off_snaps=("offense_snaps", "mean"),
        total_def_snaps=("defense_snaps", "sum")
    )
    .reset_index())
# Join with roster data for names/positions (drop_duplicates prevents join fan-out)
roster_subset = rosters[["pfr_id", "player_name", "position", "team"]].drop_duplicates()
season_snaps = season_snaps.merge(roster_subset, left_on="pfr_player_id",
                                  right_on="pfr_id", how="left")
# Filter skill positions
skill_snaps = season_snaps[season_snaps["position"].isin(["RB", "WR", "TE"])]
skill_snaps = skill_snaps.sort_values("total_off_snaps", ascending=False)
print("Top Snap Count Leaders (Skill Positions):")
print(skill_snaps.head(30))
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Compare each receiver's actual TD rate to a (simplified) expected rate
# driven by red-zone target share.
td_analysis <- pbp %>%
  filter(play_type == "pass", !is.na(receiver_player_id)) %>%
  group_by(receiver_player_id, receiver_player_name) %>%
  summarize(
    targets = n(),
    rz_targets = sum(yardline_100 <= 20, na.rm = TRUE),
    # na.rm added for consistency with the other aggregates
    tds = sum(pass_touchdown, na.rm = TRUE),
    expected_td_rate = 0.10 * (rz_targets / targets), # Simplified expected
    actual_td_rate = tds / targets,
    .groups = "drop"
  ) %>%
  filter(targets >= 50) %>%
  mutate(
    td_diff = actual_td_rate - expected_td_rate,
    regression_direction = case_when(
      td_diff > 0.03 ~ "Negative",
      td_diff < -0.03 ~ "Positive",
      TRUE ~ "Stable"
    )
  )

# Candidates for positive regression (underperformed)
positive_regression <- td_analysis %>%
  filter(regression_direction == "Positive") %>%
  arrange(td_diff)
print("Positive Regression Candidates (Due for more TDs):")
print(positive_regression %>% select(receiver_player_name, targets, tds, td_diff))

# Candidates for negative regression (overperformed)
negative_regression <- td_analysis %>%
  filter(regression_direction == "Negative") %>%
  arrange(desc(td_diff))
# cat() so the leading newline renders (print() would show "\n" literally)
cat("\nNegative Regression Candidates (TD rate unsustainable):\n")
print(negative_regression %>% select(receiver_player_name, targets, tds, td_diff))
import nfl_data_py as nfl
import pandas as pd
pbp = nfl.import_pbp_data([2023])
# Per-receiver target and TD totals on pass plays
pass_plays = pbp[(pbp["play_type"] == "pass") & (pbp["receiver_player_id"].notna())]
td_analysis = (pass_plays.groupby(["receiver_player_id", "receiver_player_name"])
    .agg(
        targets=("play_id", "count"),
        rz_targets=("yardline_100", lambda x: (x <= 20).sum()),
        tds=("pass_touchdown", "sum")
    )
    .reset_index())
# Minimum volume so the rates are meaningful
td_analysis = td_analysis[td_analysis["targets"] >= 50]
td_analysis["rz_rate"] = td_analysis["rz_targets"] / td_analysis["targets"]
# Simplified expectation: 10% of the red-zone target rate
td_analysis["expected_td_rate"] = 0.10 * td_analysis["rz_rate"]
td_analysis["actual_td_rate"] = td_analysis["tds"] / td_analysis["targets"]
td_analysis["td_diff"] = td_analysis["actual_td_rate"] - td_analysis["expected_td_rate"]
# Positive regression candidates (underperformed expectation)
positive = td_analysis[td_analysis["td_diff"] < -0.03].sort_values("td_diff")
print("Positive Regression Candidates (Due for more TDs):")
print(positive[["receiver_player_name", "targets", "tds", "td_diff"]].head(10))
# Negative regression candidates (overperformed expectation)
negative = td_analysis[td_analysis["td_diff"] > 0.03].sort_values("td_diff", ascending=False)
print("\nNegative Regression Candidates (TD rate unsustainable):")
print(negative[["receiver_player_name", "targets", "tds", "td_diff"]].head(10))
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Red zone (inside the 20) targets per receiver
rz_targets <- pbp %>%
  filter(play_type == "pass", yardline_100 <= 20, !is.na(receiver_player_id)) %>%
  group_by(receiver_player_id, receiver_player_name, posteam) %>%
  summarize(
    rz_targets = n(),
    # na.rm added for consistency with the avg_depth aggregate
    rz_receptions = sum(complete_pass, na.rm = TRUE),
    rz_tds = sum(pass_touchdown, na.rm = TRUE),
    avg_depth = mean(air_yards, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    rz_catch_rate = rz_receptions / rz_targets * 100,
    rz_td_rate = rz_tds / rz_targets * 100
  ) %>%
  arrange(desc(rz_targets))
print("Top Red Zone Target Leaders:")
print(rz_targets %>% head(25))

# Goal-to-go area: inside the 10 yard line
goalline <- pbp %>%
  filter(play_type == "pass", yardline_100 <= 10, !is.na(receiver_player_id)) %>%
  group_by(receiver_player_name) %>%
  summarize(
    gl_targets = n(),
    gl_tds = sum(pass_touchdown, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(gl_targets))
# cat() so the leading newline renders (print() would show "\n" literally)
cat("\nGoal Line Target Leaders (Inside 10):\n")
print(goalline %>% head(15))
import nfl_data_py as nfl
import pandas as pd
pbp = nfl.import_pbp_data([2023])
# Red zone targets (pass plays inside the opponent 20)
rz_plays = pbp[(pbp["play_type"] == "pass") &
               (pbp["yardline_100"] <= 20) &
               (pbp["receiver_player_id"].notna())]
rz_targets = (rz_plays.groupby(["receiver_player_id", "receiver_player_name", "posteam"])
    .agg(
        rz_targets=("play_id", "count"),
        rz_receptions=("complete_pass", "sum"),
        rz_tds=("pass_touchdown", "sum"),
        avg_depth=("air_yards", "mean")
    )
    .reset_index())
# Catch/TD rates as percentages of red-zone targets
rz_targets["rz_catch_rate"] = rz_targets["rz_receptions"] / rz_targets["rz_targets"] * 100
rz_targets["rz_td_rate"] = rz_targets["rz_tds"] / rz_targets["rz_targets"] * 100
rz_targets = rz_targets.sort_values("rz_targets", ascending=False)
print("Top Red Zone Target Leaders:")
print(rz_targets.head(25))
# Goal line targets (inside the opponent 10)
gl_plays = pbp[(pbp["play_type"] == "pass") &
               (pbp["yardline_100"] <= 10) &
               (pbp["receiver_player_id"].notna())]
goalline = (gl_plays.groupby("receiver_player_name")
    .agg(gl_targets=("play_id", "count"), gl_tds=("pass_touchdown", "sum"))
    .reset_index()
    .sort_values("gl_targets", ascending=False))
print("\nGoal Line Target Leaders:")
print(goalline.head(15))
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Weekly PPR fantasy points per skill player
fantasy_ppg <- pbp %>%
  filter(!is.na(receiver_player_id) | !is.na(rusher_player_id)) %>%
  mutate(
    player_id = coalesce(receiver_player_id, rusher_player_id),
    player_name = coalesce(receiver_player_name, rusher_player_name)
  ) %>%
  group_by(player_id, player_name, week) %>%
  summarize(
    rush_yards = sum(rushing_yards, na.rm = TRUE),
    rush_tds = sum(rush_touchdown, na.rm = TRUE),
    receptions = sum(complete_pass, na.rm = TRUE),
    rec_yards = sum(receiving_yards, na.rm = TRUE),
    # Use pass_touchdown for receiving TDs, consistent with the other fantasy
    # snippets; `touchdown` also counts rushing/return TDs on the same play.
    rec_tds = sum(pass_touchdown[!is.na(receiver_player_id)], na.rm = TRUE),
    fantasy_pts = rush_yards * 0.1 + rush_tds * 6 + receptions * 1 +
      rec_yards * 0.1 + rec_tds * 6,
    .groups = "drop"
  )

# Consistency metrics per player
consistency <- fantasy_ppg %>%
  group_by(player_id, player_name) %>%
  summarize(
    games = n(),
    avg_pts = mean(fantasy_pts),
    std_pts = sd(fantasy_pts),
    floor = quantile(fantasy_pts, 0.1),
    ceiling = quantile(fantasy_pts, 0.9),
    boom_rate = mean(fantasy_pts > 15) * 100, # "Boom" games
    bust_rate = mean(fantasy_pts < 5) * 100,  # "Bust" games
    .groups = "drop"
  ) %>%
  filter(games >= 8) %>%
  mutate(
    consistency_score = avg_pts / (std_pts + 1), # Higher is better
    upside_score = ceiling / avg_pts # Higher ceiling relative to average
  ) %>%
  arrange(desc(consistency_score))
print("Most Consistent Fantasy Performers:")
print(consistency %>% select(player_name, avg_pts, boom_rate, bust_rate, consistency_score) %>% head(20))
import nfl_data_py as nfl
import pandas as pd
import numpy as np
pbp = nfl.import_pbp_data([2023])
# Weekly PPR fantasy points per skill player (receiver id wins when both set)
skill_plays = pbp[(pbp["receiver_player_id"].notna()) | (pbp["rusher_player_id"].notna())].copy()
skill_plays["player_id"] = skill_plays["receiver_player_id"].fillna(skill_plays["rusher_player_id"])
skill_plays["player_name"] = skill_plays["receiver_player_name"].fillna(skill_plays["rusher_player_name"])
weekly = (skill_plays.groupby(["player_id", "player_name", "week"])
    .agg(
        rush_yards=("rushing_yards", "sum"),
        rush_tds=("rush_touchdown", "sum"),
        receptions=("complete_pass", "sum"),
        rec_yards=("receiving_yards", "sum"),
        rec_tds=("pass_touchdown", "sum")
    )
    .reset_index())
# PPR scoring: 0.1/rush yd, 6/rush TD, 1/reception, 0.1/rec yd, 6/rec TD
weekly["fantasy_pts"] = (weekly["rush_yards"] * 0.1 + weekly["rush_tds"] * 6 +
                         weekly["receptions"] * 1 + weekly["rec_yards"] * 0.1 +
                         weekly["rec_tds"] * 6)
# Consistency metrics across weeks
consistency = (weekly.groupby(["player_id", "player_name"])
    .agg(
        games=("fantasy_pts", "count"),
        avg_pts=("fantasy_pts", "mean"),
        std_pts=("fantasy_pts", "std"),
        floor=("fantasy_pts", lambda x: x.quantile(0.1)),
        ceiling=("fantasy_pts", lambda x: x.quantile(0.9)),
        boom_rate=("fantasy_pts", lambda x: (x > 15).mean() * 100),
        bust_rate=("fantasy_pts", lambda x: (x < 5).mean() * 100)
    )
    .reset_index())
# Require a half season of data; higher score = steadier week-to-week
consistency = consistency[consistency["games"] >= 8]
consistency["consistency_score"] = consistency["avg_pts"] / (consistency["std_pts"] + 1)
consistency = consistency.sort_values("consistency_score", ascending=False)
print("Most Consistent Fantasy Performers:")
print(consistency[["player_name", "avg_pts", "boom_rate", "bust_rate", "consistency_score"]].head(20))
nflfastR
tidyverse
nfl_data_py
pandas
numpy
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# NOTE(review): `rosters` is loaded but never used below — confirm or drop.
rosters <- fast_scraper_roster(2023)
# Per-player opportunity (targets + carries) and PPR production
opportunities <- pbp %>%
  filter(!is.na(receiver_player_id) | !is.na(rusher_player_id)) %>%
  mutate(
    player_id = coalesce(receiver_player_id, rusher_player_id),
    player_name = coalesce(receiver_player_name, rusher_player_name),
    is_target = !is.na(receiver_player_id),
    is_carry = !is.na(rusher_player_id) & play_type == "run"
  ) %>%
  group_by(player_id, player_name, posteam) %>%
  summarize(
    games = n_distinct(game_id),
    targets = sum(is_target),
    carries = sum(is_carry),
    touches = targets + carries,
    # PPR: 0.1/rush yd, 6/rush TD, 1/reception, 0.1/rec yd, 6/rec TD
    fantasy_pts = sum(rushing_yards, na.rm = TRUE) * 0.1 +
      sum(rush_touchdown, na.rm = TRUE) * 6 +
      sum(complete_pass, na.rm = TRUE) * 1 +
      sum(receiving_yards, na.rm = TRUE) * 0.1 +
      sum(pass_touchdown[is_target], na.rm = TRUE) * 6,
    .groups = "drop"
  ) %>%
  mutate(
    touches_per_game = touches / games,
    pts_per_touch = fantasy_pts / touches,
    pts_per_game = fantasy_pts / games,
    # Project remaining games (assuming 17-game season)
    # NOTE(review): `games` counts games with a touch, not team games played,
    # so games_remaining may overstate what is actually left — confirm intent.
    games_remaining = 17 - games,
    projected_ros_pts = pts_per_game * games_remaining
  ) %>%
  filter(games >= 5, touches >= 30) %>%
  arrange(desc(projected_ros_pts))
print("Rest of Season Fantasy Rankings:")
print(opportunities %>%
  select(player_name, games, pts_per_game, games_remaining, projected_ros_pts) %>%
  head(30))
import nfl_data_py as nfl
import pandas as pd
pbp = nfl.import_pbp_data([2023])
# Per-player opportunity metrics (receiver id wins when both ids are set)
skill_plays = pbp[(pbp["receiver_player_id"].notna()) | (pbp["rusher_player_id"].notna())].copy()
skill_plays["player_id"] = skill_plays["receiver_player_id"].fillna(skill_plays["rusher_player_id"])
skill_plays["player_name"] = skill_plays["receiver_player_name"].fillna(skill_plays["rusher_player_name"])
skill_plays["is_target"] = skill_plays["receiver_player_id"].notna()
skill_plays["is_carry"] = (skill_plays["rusher_player_id"].notna()) & (skill_plays["play_type"] == "run")
opportunities = (skill_plays.groupby(["player_id", "player_name", "posteam"])
    .agg(
        games=("game_id", "nunique"),
        targets=("is_target", "sum"),
        carries=("is_carry", "sum"),
        rush_yards=("rushing_yards", "sum"),
        rush_tds=("rush_touchdown", "sum"),
        receptions=("complete_pass", "sum"),
        rec_yards=("receiving_yards", "sum"),
        rec_tds=("pass_touchdown", "sum")
    )
    .reset_index())
opportunities["touches"] = opportunities["targets"] + opportunities["carries"]
# PPR scoring: 0.1/rush yd, 6/rush TD, 1/reception, 0.1/rec yd, 6/rec TD
opportunities["fantasy_pts"] = (opportunities["rush_yards"] * 0.1 +
                                opportunities["rush_tds"] * 6 +
                                opportunities["receptions"] * 1 +
                                opportunities["rec_yards"] * 0.1 +
                                opportunities["rec_tds"] * 6)
opportunities["pts_per_game"] = opportunities["fantasy_pts"] / opportunities["games"]
# NOTE(review): `games` counts games with a touch, not team games played —
# games_remaining may overstate what is left.  17-game season assumed.
opportunities["games_remaining"] = 17 - opportunities["games"]
opportunities["projected_ros_pts"] = opportunities["pts_per_game"] * opportunities["games_remaining"]
# Minimum sample before projecting
opportunities = opportunities[(opportunities["games"] >= 5) & (opportunities["touches"] >= 30)]
opportunities = opportunities.sort_values("projected_ros_pts", ascending=False)
print("Rest of Season Fantasy Rankings:")
print(opportunities[["player_name", "games", "pts_per_game", "games_remaining", "projected_ros_pts"]].head(30))
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Season PPR production per skill player
player_values <- pbp %>%
  filter(!is.na(receiver_player_id) | !is.na(rusher_player_id)) %>%
  mutate(
    player_id = coalesce(receiver_player_id, rusher_player_id),
    player_name = coalesce(receiver_player_name, rusher_player_name)
  ) %>%
  group_by(player_id, player_name, posteam) %>%
  summarize(
    games = n_distinct(game_id),
    # PPR: 0.1/rush yd, 6/rush TD, 1/reception, 0.1/rec yd, 6/rec TD
    fantasy_pts = sum(rushing_yards, na.rm = TRUE) * 0.1 +
      sum(rush_touchdown, na.rm = TRUE) * 6 +
      sum(complete_pass, na.rm = TRUE) * 1 +
      sum(receiving_yards, na.rm = TRUE) * 0.1 +
      sum(pass_touchdown[!is.na(receiver_player_id)], na.rm = TRUE) * 6,
    .groups = "drop"
  ) %>%
  mutate(ppg = fantasy_pts / games) %>%
  filter(games >= 5)
# Create trade value scale (1-100)
player_values <- player_values %>%
  mutate(
    # Base value on PPG percentile
    value_percentile = percent_rank(ppg),
    # Scale to 1-100
    trade_value = round(value_percentile * 100),
    # Tier assignment
    tier = case_when(
      trade_value >= 90 ~ "Elite (Tier 1)",
      trade_value >= 75 ~ "Star (Tier 2)",
      trade_value >= 60 ~ "Starter (Tier 3)",
      trade_value >= 40 ~ "Flex (Tier 4)",
      TRUE ~ "Bench (Tier 5)"
    )
  ) %>%
  arrange(desc(trade_value))
print("Fantasy Trade Values:")
print(player_values %>%
  select(player_name, ppg, trade_value, tier) %>%
  head(40))
import nfl_data_py as nfl
import pandas as pd
import numpy as np
pbp = nfl.import_pbp_data([2023])
# Season PPR production per skill player (receiver id wins when both set)
skill_plays = pbp[(pbp["receiver_player_id"].notna()) | (pbp["rusher_player_id"].notna())].copy()
skill_plays["player_id"] = skill_plays["receiver_player_id"].fillna(skill_plays["rusher_player_id"])
skill_plays["player_name"] = skill_plays["receiver_player_name"].fillna(skill_plays["rusher_player_name"])
player_values = (skill_plays.groupby(["player_id", "player_name", "posteam"])
    .agg(
        games=("game_id", "nunique"),
        rush_yards=("rushing_yards", "sum"),
        rush_tds=("rush_touchdown", "sum"),
        receptions=("complete_pass", "sum"),
        rec_yards=("receiving_yards", "sum"),
        rec_tds=("pass_touchdown", "sum")
    )
    .reset_index())
# PPR scoring: 0.1/rush yd, 6/rush TD, 1/reception, 0.1/rec yd, 6/rec TD
player_values["fantasy_pts"] = (player_values["rush_yards"] * 0.1 +
                                player_values["rush_tds"] * 6 +
                                player_values["receptions"] * 1 +
                                player_values["rec_yards"] * 0.1 +
                                player_values["rec_tds"] * 6)
player_values["ppg"] = player_values["fantasy_pts"] / player_values["games"]
# Minimum sample of games
player_values = player_values[player_values["games"] >= 5]
# Trade value: PPG percentile rank scaled to 0-100
player_values["value_percentile"] = player_values["ppg"].rank(pct=True)
player_values["trade_value"] = (player_values["value_percentile"] * 100).round()
def get_tier(val):
    """Map a 0-100 trade value onto a named tier label."""
    cutoffs = (
        (90, "Elite (Tier 1)"),
        (75, "Star (Tier 2)"),
        (60, "Starter (Tier 3)"),
        (40, "Flex (Tier 4)"),
    )
    for floor, label in cutoffs:
        if val >= floor:
            return label
    return "Bench (Tier 5)"
# Attach the tier label and show the top of the board
player_values["tier"] = player_values["trade_value"].apply(get_tier)
player_values = player_values.sort_values("trade_value", ascending=False)
print("Fantasy Trade Values:")
print(player_values[["player_name", "ppg", "trade_value", "tier"]].head(40))
nflfastR
tidyverse
nfl_data_py
pandas
numpy
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# NOTE(review): `schedules` is loaded but never used below — confirm or drop.
schedules <- load_schedules(2023)
# Defense vs RB: rushing efficiency allowed
def_vs_rb <- pbp %>%
  filter(play_type == "run", !is.na(epa)) %>%
  group_by(defteam) %>%
  summarize(
    plays = n(),
    rush_epa_allowed = mean(epa),
    rush_ypc_allowed = mean(yards_gained),
    rush_td_allowed = sum(rush_touchdown),
    .groups = "drop"
  ) %>%
  mutate(
    rb_rank = rank(rush_epa_allowed) # Lower EPA = better defense
  )
# Defense vs WR: passing efficiency allowed
def_vs_wr <- pbp %>%
  filter(play_type == "pass", !is.na(epa)) %>%
  group_by(defteam) %>%
  summarize(
    plays = n(),
    pass_epa_allowed = mean(epa),
    ypa_allowed = mean(yards_gained),
    pass_td_allowed = sum(pass_touchdown),
    .groups = "drop"
  ) %>%
  mutate(
    wr_rank = rank(pass_epa_allowed) # Lower EPA = better defense
  )
# Combine rankings and bucket matchups (rank 1 = toughest defense)
def_rankings <- def_vs_rb %>%
  select(defteam, rush_epa_allowed, rb_rank) %>%
  left_join(def_vs_wr %>% select(defteam, pass_epa_allowed, wr_rank), by = "defteam") %>%
  mutate(
    rb_matchup = case_when(
      rb_rank <= 8 ~ "Tough",
      rb_rank <= 24 ~ "Average",
      TRUE ~ "Favorable"
    ),
    wr_matchup = case_when(
      wr_rank <= 8 ~ "Tough",
      wr_rank <= 24 ~ "Average",
      TRUE ~ "Favorable"
    )
  )
print("Defensive Matchup Rankings:")
print(def_rankings %>% arrange(rb_rank))
import nfl_data_py as nfl
import pandas as pd
pbp = nfl.import_pbp_data([2023])
# NOTE(review): `schedules` is loaded but never used below — confirm or drop.
schedules = nfl.import_schedules([2023])
# Defense vs RB: rushing efficiency allowed (lower EPA = better defense)
rushes = pbp[(pbp["play_type"] == "run") & (pbp["epa"].notna())]
def_vs_rb = (rushes.groupby("defteam")
    .agg(
        plays=("epa", "count"),
        rush_epa_allowed=("epa", "mean"),
        rush_ypc=("yards_gained", "mean"),
        rush_td=("rush_touchdown", "sum")
    )
    .reset_index())
def_vs_rb["rb_rank"] = def_vs_rb["rush_epa_allowed"].rank()
# Defense vs WR: passing efficiency allowed
passes = pbp[(pbp["play_type"] == "pass") & (pbp["epa"].notna())]
def_vs_wr = (passes.groupby("defteam")
    .agg(
        plays=("epa", "count"),
        pass_epa_allowed=("epa", "mean"),
        ypa=("yards_gained", "mean"),
        pass_td=("pass_touchdown", "sum")
    )
    .reset_index())
def_vs_wr["wr_rank"] = def_vs_wr["pass_epa_allowed"].rank()
# Combine the two rankings per defense
def_rankings = def_vs_rb[["defteam", "rush_epa_allowed", "rb_rank"]].merge(
    def_vs_wr[["defteam", "pass_epa_allowed", "wr_rank"]], on="defteam")
def rb_matchup(rank):
    """Label a defensive rank: top 8 = Tough, 9-24 = Average, else Favorable."""
    if rank > 24:
        return "Favorable"
    return "Tough" if rank <= 8 else "Average"
# The same rank bucketing applies to both the run and pass rankings
def_rankings["rb_matchup"] = def_rankings["rb_rank"].apply(rb_matchup)
def_rankings["wr_matchup"] = def_rankings["wr_rank"].apply(rb_matchup)
print("Defensive Matchup Rankings:")
print(def_rankings.sort_values("rb_rank"))
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Weekly touches and PPR points per skill player.
# (The unused `is_touch = TRUE` helper column was removed.)
weekly_touches <- pbp %>%
  filter(!is.na(receiver_player_id) | !is.na(rusher_player_id)) %>%
  mutate(
    player_id = coalesce(receiver_player_id, rusher_player_id),
    player_name = coalesce(receiver_player_name, rusher_player_name)
  ) %>%
  group_by(player_id, player_name, week, posteam) %>%
  summarize(
    touches = n(),
    fantasy_pts = sum(rushing_yards, na.rm = TRUE) * 0.1 +
      sum(rush_touchdown, na.rm = TRUE) * 6 +
      sum(complete_pass, na.rm = TRUE) * 1 +
      sum(receiving_yards, na.rm = TRUE) * 0.1 +
      sum(pass_touchdown[!is.na(receiver_player_id)], na.rm = TRUE) * 6,
    .groups = "drop"
  )

# Trend: last 3 weeks vs first 3 weeks (middle weeks are ignored).
# case_when order matters: "Recent" wins if the season is short enough that
# the two windows would overlap.
max_week <- max(weekly_touches$week)
trending <- weekly_touches %>%
  mutate(
    period = case_when(
      week >= max_week - 2 ~ "Recent",
      week <= 3 ~ "Early",
      TRUE ~ "Middle"
    )
  ) %>%
  filter(period %in% c("Recent", "Early")) %>%
  group_by(player_id, player_name, posteam, period) %>%
  summarize(
    avg_touches = mean(touches),
    avg_pts = mean(fantasy_pts),
    .groups = "drop"
  ) %>%
  pivot_wider(names_from = period, values_from = c(avg_touches, avg_pts)) %>%
  filter(!is.na(avg_touches_Recent), !is.na(avg_touches_Early)) %>%
  mutate(
    touch_trend = avg_touches_Recent - avg_touches_Early,
    pts_trend = avg_pts_Recent - avg_pts_Early
  ) %>%
  filter(avg_pts_Recent < 15) %>% # Filter out established stars
  arrange(desc(touch_trend))
print("Waiver Wire Gems (Increasing Opportunity):")
print(trending %>%
  select(player_name, avg_touches_Early, avg_touches_Recent, touch_trend, pts_trend) %>%
  head(15))
import nfl_data_py as nfl
import pandas as pd
pbp = nfl.import_pbp_data([2023])
# Weekly touches and PPR points per skill player
skill_plays = pbp[(pbp["receiver_player_id"].notna()) | (pbp["rusher_player_id"].notna())].copy()
skill_plays["player_id"] = skill_plays["receiver_player_id"].fillna(skill_plays["rusher_player_id"])
skill_plays["player_name"] = skill_plays["receiver_player_name"].fillna(skill_plays["rusher_player_name"])
weekly = (skill_plays.groupby(["player_id", "player_name", "week", "posteam"])
    .agg(
        touches=("play_id", "count"),
        rush_yards=("rushing_yards", "sum"),
        rush_tds=("rush_touchdown", "sum"),
        receptions=("complete_pass", "sum"),
        rec_yards=("receiving_yards", "sum"),
        rec_tds=("pass_touchdown", "sum")
    )
    .reset_index())
# PPR scoring: 0.1/rush yd, 6/rush TD, 1/reception, 0.1/rec yd, 6/rec TD
weekly["fantasy_pts"] = (weekly["rush_yards"] * 0.1 + weekly["rush_tds"] * 6 +
                         weekly["receptions"] * 1 + weekly["rec_yards"] * 0.1 +
                         weekly["rec_tds"] * 6)
# Trend: first 3 weeks vs last 3 weeks (middle weeks ignored; the inner merge
# keeps only players who appear in BOTH windows)
max_week = weekly["week"].max()
early = weekly[weekly["week"] <= 3].groupby(["player_id", "player_name"]).agg(
    avg_touches_early=("touches", "mean"),
    avg_pts_early=("fantasy_pts", "mean")
).reset_index()
recent = weekly[weekly["week"] >= max_week - 2].groupby(["player_id", "player_name"]).agg(
    avg_touches_recent=("touches", "mean"),
    avg_pts_recent=("fantasy_pts", "mean")
).reset_index()
trending = early.merge(recent, on=["player_id", "player_name"])
trending["touch_trend"] = trending["avg_touches_recent"] - trending["avg_touches_early"]
trending["pts_trend"] = trending["avg_pts_recent"] - trending["avg_pts_early"]
# Filter out established stars
trending = trending[trending["avg_pts_recent"] < 15]
trending = trending.sort_values("touch_trend", ascending=False)
print("Waiver Wire Gems (Increasing Opportunity):")
print(trending[["player_name", "avg_touches_early", "avg_touches_recent", "touch_trend", "pts_trend"]].head(15))
nflfastR
tidyverse
nfl_data_py
pandas
Apply ML techniques for predictions, clustering, and pattern recognition
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Prepare training data: one row per play with the game state and nflfastR's
# win probability (wp) as a continuous target in [0, 1].
model_data <- pbp %>%
filter(!is.na(score_differential), !is.na(wp)) %>%
select(
home_team, away_team, posteam,
game_seconds_remaining, score_differential,
down, ydstogo, yardline_100,
posteam_timeouts_remaining, defteam_timeouts_remaining,
receive_2h_ko,
wp # Target variable (use as proxy for outcome)
) %>%
mutate(
is_home = posteam == home_team,
# Derive the quarter from seconds remaining (900 s per quarter).
# pmax() fixes the opening-kickoff edge case: at exactly 3600 s the
# original 4 - (3600 %/% 900) evaluated to 0 instead of 1.
qtr = pmax(1, 4 - (game_seconds_remaining %/% 900))
) %>%
drop_na()
# Split data 80/20 by row
set.seed(42)
train_idx <- sample(nrow(model_data), 0.8 * nrow(model_data))
train <- model_data[train_idx, ]
test <- model_data[-train_idx, ]
# Fit logistic regression. quasibinomial permits a continuous response in
# [0, 1] (wp is a probability, not a 0/1 outcome).
model <- glm(
wp ~ score_differential + game_seconds_remaining +
is_home + down + ydstogo + yardline_100 +
posteam_timeouts_remaining,
data = train,
family = "quasibinomial"
)
# Evaluate: agreement with nflfastR's wp at the 0.5 threshold, plus the
# Brier score of predicted probabilities against wp.
test$pred_wp <- predict(model, test, type = "response")
test$correct <- (test$pred_wp > 0.5) == (test$wp > 0.5)
cat("Accuracy:", mean(test$correct) * 100, "%\n")
cat("Brier Score:", mean((test$pred_wp - test$wp)^2), "\n")
import nfl_data_py as nfl
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, brier_score_loss
pbp = nfl.import_pbp_data([2023])
# Prepare data
model_data = pbp[
pbp["score_differential"].notna() &
pbp["wp"].notna()
][["home_team", "posteam", "game_seconds_remaining",
"score_differential", "down", "ydstogo", "yardline_100",
"posteam_timeouts_remaining", "wp"]].dropna()
model_data["is_home"] = (model_data["posteam"] == model_data["home_team"]).astype(int)
# Features and target
features = ["score_differential", "game_seconds_remaining", "is_home",
"down", "ydstogo", "yardline_100", "posteam_timeouts_remaining"]
X = model_data[features]
# Binarize nflfastR's continuous win probability at 0.5 — the label is a
# proxy ("was the offense favored?"), not the actual game outcome.
y = (model_data["wp"] > 0.5).astype(int)
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
# Fit model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
# Evaluate
# Accuracy on hard predictions; Brier score on the class-1 probabilities.
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.1f}%")
print(f"Brier Score: {brier_score_loss(y_test, y_prob):.4f}")
nflfastR
tidyverse
nfl_data_py
pandas
scikit-learn
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Create receiver profiles
receiver_profiles <- pbp %>%
filter(!is.na(receiver_player_id), play_type == "pass") %>%
group_by(receiver_player_id, receiver_player_name) %>%
summarize(
targets = n(),
catch_rate = mean(complete_pass),
avg_depth = mean(air_yards, na.rm = TRUE),
# YAC averaged over completed catches only.
yac_per_rec = mean(yards_after_catch[complete_pass == 1], na.rm = TRUE),
# NOTE(review): defenders_in_box is a box-count field; >= 7 as a
# "contested" proxy is rough — confirm intent. Also note this column
# is not included in the clustering features below.
contested_rate = mean(defenders_in_box >= 7, na.rm = TRUE),
.groups = "drop"
) %>%
filter(targets >= 40) %>%
drop_na()
# Scale features
# Standardize so no single feature dominates the Euclidean distance.
features <- receiver_profiles %>%
select(catch_rate, avg_depth, yac_per_rec)
scaled_features <- scale(features)
# K-means clustering
set.seed(42)
kmeans_result <- kmeans(scaled_features, centers = 4, nstart = 25)
# Add clusters to data
receiver_profiles$cluster <- factor(kmeans_result$cluster)
# Analyze clusters
cluster_summary <- receiver_profiles %>%
group_by(cluster) %>%
summarize(
n_players = n(),
avg_catch_rate = mean(catch_rate),
avg_depth = mean(avg_depth),
avg_yac = mean(yac_per_rec)
)
print(cluster_summary)
# Name clusters based on characteristics
# Cluster 1: Deep threats (high depth)
# Cluster 2: Possession receivers (high catch rate)
# etc.
import nfl_data_py as nfl
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
pbp = nfl.import_pbp_data([2023])
# Create receiver profiles
pass_plays = pbp[(pbp["receiver_player_id"].notna()) &
(pbp["play_type"] == "pass")]
receiver_profiles = (pass_plays.groupby(
["receiver_player_id", "receiver_player_name"])
.agg(
targets=("play_id", "count"),
catch_rate=("complete_pass", "mean"),
avg_depth=("air_yards", "mean"),
# YAC averaged over completions only: the lambda indexes back into
# pass_plays via x.index to mask each group's rows to complete_pass == 1.
yac_per_rec=("yards_after_catch", lambda x: x[pass_plays.loc[x.index, "complete_pass"] == 1].mean())
)
.reset_index()
.dropna())
# Require meaningful target volume before clustering.
receiver_profiles = receiver_profiles[receiver_profiles["targets"] >= 40]
# Scale features
# Standardize so no single feature dominates the Euclidean distance.
features = receiver_profiles[["catch_rate", "avg_depth", "yac_per_rec"]]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)
# K-means clustering
kmeans = KMeans(n_clusters=4, random_state=42, n_init=25)
receiver_profiles["cluster"] = kmeans.fit_predict(scaled_features)
# Analyze clusters
cluster_summary = (receiver_profiles.groupby("cluster")
.agg(
n_players=("receiver_player_name", "count"),
avg_catch_rate=("catch_rate", "mean"),
avg_depth=("avg_depth", "mean"),
avg_yac=("yac_per_rec", "mean")
)
.reset_index())
print(cluster_summary)
nflfastR
tidyverse
nfl_data_py
pandas
scikit-learn
library(nflfastR)
library(tidyverse)
library(randomForest)
pbp <- load_pbp(2023)
# Prepare data for play prediction (pass vs. run from pre-snap situation)
model_data <- pbp %>%
filter(play_type %in% c("pass", "run"), !is.na(down)) %>%
mutate(
is_pass = as.factor(play_type == "pass"),
is_home = posteam == home_team
) %>%
select(is_pass, down, ydstogo, yardline_100, score_differential,
game_seconds_remaining, is_home) %>%
drop_na()
# Split data 80/20 by row
set.seed(42)
train_idx <- sample(nrow(model_data), 0.8 * nrow(model_data))
train <- model_data[train_idx, ]
test <- model_data[-train_idx, ]
# Train random forest
rf_model <- randomForest(
is_pass ~ .,
data = train,
ntree = 100,
importance = TRUE
)
# Evaluate held-out classification accuracy
test$pred <- predict(rf_model, test)
accuracy <- mean(test$pred == test$is_pass)
cat("Accuracy:", accuracy * 100, "%\n")
# Feature importance, ranked by mean decrease in Gini impurity
importance(rf_model) %>% as.data.frame() %>%
rownames_to_column("feature") %>%
arrange(desc(MeanDecreaseGini)) %>%
print()
import nfl_data_py as nfl
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Pull 2023 play-by-play and keep only called run/pass plays with a down.
pbp = nfl.import_pbp_data([2023])
mask = pbp["play_type"].isin(["pass", "run"]) & pbp["down"].notna()
plays = pbp[mask].copy()

# Binary target plus a home-field indicator.
plays["is_pass"] = (plays["play_type"] == "pass").astype(int)
plays["is_home"] = (plays["posteam"] == plays["home_team"]).astype(int)

feature_cols = ["down", "ydstogo", "yardline_100", "score_differential",
                "game_seconds_remaining", "is_home"]
plays = plays[feature_cols + ["is_pass"]].dropna()

# 80/20 train/test split on the situational features.
X_train, X_test, y_train, y_test = train_test_split(
    plays[feature_cols], plays["is_pass"], test_size=0.2, random_state=42
)

# Fit a 100-tree random forest play-call classifier.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Report held-out accuracy and per-feature importance.
y_pred = rf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.1f}%")
print("\nFeature Importance:")
importance = pd.DataFrame({
    "feature": feature_cols,
    "importance": rf.feature_importances_
}).sort_values("importance", ascending=False)
print(importance)
nflfastR
tidyverse
randomForest
nfl_data_py
pandas
scikit-learn
library(nflfastR)
library(tidyverse)
library(xgboost)
schedules <- load_schedules(2020:2023)
pbp <- load_pbp(2020:2023)
# Calculate team metrics per season: offensive EPA/play and pass rate,
# joined with defensive EPA/play allowed.
team_metrics <- pbp %>%
filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
group_by(season, posteam) %>%
summarize(
off_epa = mean(epa),
pass_rate = mean(play_type == "pass"),
.groups = "drop"
) %>%
left_join(
pbp %>%
filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
group_by(season, defteam) %>%
summarize(def_epa = mean(epa), .groups = "drop"),
by = c("season", "posteam" = "defteam")
)
# Create game features: attach each side's season metrics to completed games
games <- schedules %>%
filter(!is.na(result)) %>%
left_join(team_metrics, by = c("season", "home_team" = "posteam")) %>%
rename(home_off = off_epa, home_def = def_epa, home_pass = pass_rate) %>%
left_join(team_metrics, by = c("season", "away_team" = "posteam")) %>%
rename(away_off = off_epa, away_def = def_epa, away_pass = pass_rate) %>%
mutate(home_win = as.numeric(result > 0)) %>%
select(home_off, home_def, home_pass, away_off, away_def, away_pass, home_win) %>%
drop_na()
# Prepare for XGBoost (xgboost wants a numeric matrix plus a label vector)
train_matrix <- as.matrix(games %>% select(-home_win))
labels <- games$home_win
# Train XGBoost
# NOTE(review): trained on all games with no holdout — importance only,
# not an unbiased accuracy estimate.
xgb_model <- xgboost(
data = train_matrix,
label = labels,
nrounds = 100,
objective = "binary:logistic",
verbose = 0
)
# Feature importance
importance <- xgb.importance(model = xgb_model)
print(importance)
import nfl_data_py as nfl
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Game-outcome model: predict home wins from season-level team EPA metrics.
pbp = nfl.import_pbp_data([2020, 2021, 2022, 2023])
schedules = nfl.import_schedules([2020, 2021, 2022, 2023])
# Team metrics: offensive EPA/play + pass rate, defensive EPA/play allowed.
plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"].isin(["pass", "run"]))]
team_off = (plays.groupby(["season", "posteam"])
    .agg(off_epa=("epa", "mean"), pass_rate=("play_type", lambda x: (x == "pass").mean()))
    .reset_index())
team_def = (plays.groupby(["season", "defteam"])
    .agg(def_epa=("epa", "mean"))
    .reset_index())
team_metrics = team_off.merge(team_def, left_on=["season", "posteam"],
                              right_on=["season", "defteam"])
# Create game features: attach each side's season metrics to completed games.
games = schedules[schedules["result"].notna()].copy()
games = games.merge(team_metrics[["season", "posteam", "off_epa", "def_epa", "pass_rate"]],
                    left_on=["season", "home_team"], right_on=["season", "posteam"])
games = games.rename(columns={"off_epa": "home_off", "def_epa": "home_def", "pass_rate": "home_pass"})
games = games.merge(team_metrics[["season", "posteam", "off_epa", "def_epa", "pass_rate"]],
                    left_on=["season", "away_team"], right_on=["season", "posteam"])
games = games.rename(columns={"off_epa": "away_off", "def_epa": "away_def", "pass_rate": "away_pass"})
# result is the home margin in nflfastR schedules, so > 0 means a home win.
games["home_win"] = (games["result"] > 0).astype(int)
features = ["home_off", "home_def", "home_pass", "away_off", "away_def", "away_pass"]
games = games[features + ["home_win"]].dropna()
# Train XGBoost on an 80/20 split.
X_train, X_test, y_train, y_test = train_test_split(
    games[features], games["home_win"], test_size=0.2, random_state=42)
# use_label_encoder was deprecated in xgboost 1.6 and removed in 2.x; it is
# not needed for integer 0/1 labels, so it is omitted here.
model = xgb.XGBClassifier(n_estimators=100, eval_metric="logloss")
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.1f}%")
print("\nFeature Importance:")
print(pd.DataFrame({"feature": features, "importance": model.feature_importances_}).sort_values("importance", ascending=False))
nflfastR
tidyverse
xgboost
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
# Create advanced features
features <- pbp %>%
filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
mutate(
# Situational features
is_neutral_game_script = abs(score_differential) <= 7,
# "Garbage time": Q4 blowout, or the final two minutes with a 2+ score gap.
is_garbage_time = (qtr == 4 & abs(score_differential) > 21) |
(qtr >= 4 & game_seconds_remaining < 120 & abs(score_differential) > 14),
is_two_minute = game_seconds_remaining <= 120 & qtr %in% c(2, 4),
is_redzone = yardline_100 <= 20,
is_goal_to_go = goal_to_go == 1,
is_third_or_fourth = down >= 3,
# Distance buckets
distance_bucket = case_when(
ydstogo <= 3 ~ "short",
ydstogo <= 7 ~ "medium",
ydstogo <= 10 ~ "long",
TRUE ~ "very_long"
),
# Field position buckets (yardline_100 = yards from the opponent's goal)
field_bucket = case_when(
yardline_100 >= 80 ~ "own_deep",
yardline_100 >= 50 ~ "own_side",
yardline_100 >= 20 ~ "opponent",
TRUE ~ "redzone"
),
# Time features (3600 seconds of regulation)
game_pct_complete = 1 - (game_seconds_remaining / 3600),
half = if_else(qtr <= 2, 1, 2),
# Rolling metrics (would need window function)
pass_heavy = NA # Placeholder for rolling pass rate
)
# Show feature distribution
features %>%
group_by(distance_bucket, field_bucket) %>%
summarize(
plays = n(),
pass_rate = mean(play_type == "pass"),
avg_epa = mean(epa),
.groups = "drop"
) %>%
print()
import nfl_data_py as nfl
import pandas as pd
import numpy as np
pbp = nfl.import_pbp_data([2023])
# Create advanced features
# Keep only run/pass plays with a computed EPA.
plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"].isin(["pass", "run"]))].copy()
# Situational features
plays["is_neutral"] = plays["score_differential"].abs() <= 7
# "Garbage time": Q4 blowout, or the final two minutes with a 2+ score gap.
plays["is_garbage_time"] = (
((plays["qtr"] == 4) & (plays["score_differential"].abs() > 21)) |
((plays["qtr"] >= 4) & (plays["game_seconds_remaining"] < 120) &
(plays["score_differential"].abs() > 14))
)
plays["is_two_minute"] = (plays["game_seconds_remaining"] <= 120) & (plays["qtr"].isin([2, 4]))
plays["is_redzone"] = plays["yardline_100"] <= 20
plays["is_third_or_fourth"] = plays["down"] >= 3
# Distance buckets
def distance_bucket(yd):
    """Bucket yards-to-go into short / medium / long / very_long."""
    # Walk the cutoffs in ascending order; first match wins.
    for limit, label in ((3, "short"), (7, "medium"), (10, "long")):
        if yd <= limit:
            return label
    return "very_long"
# Categorize each play's yards-to-go.
plays["distance_bucket"] = plays["ydstogo"].apply(distance_bucket)
# Field position buckets
def field_bucket(yd):
    """Label field position from yardline_100 (yards to the opponent's goal)."""
    # Walk from deepest own territory toward the red zone; first match wins.
    for floor, label in ((80, "own_deep"), (50, "own_side"), (20, "opponent")):
        if yd >= floor:
            return label
    return "redzone"
# Categorize each play's field position.
plays["field_bucket"] = plays["yardline_100"].apply(field_bucket)
# Time features
# Fraction of regulation (3600 s) already played, and which half.
plays["game_pct_complete"] = 1 - (plays["game_seconds_remaining"] / 3600)
plays["half"] = np.where(plays["qtr"] <= 2, 1, 2)
# Show feature distribution
feature_dist = (plays.groupby(["distance_bucket", "field_bucket"])
.agg(
plays=("epa", "count"),
pass_rate=("play_type", lambda x: (x == "pass").mean()),
avg_epa=("epa", "mean")
)
.reset_index())
print(feature_dist)
nflfastR
tidyverse
nfl_data_py
pandas
numpy
library(nflfastR)
library(tidyverse)
library(caret)
pbp <- load_pbp(2020:2023)
# Prepare data: run/pass plays with a known down, plus season/week columns
# for time-aware validation splits.
model_data <- pbp %>%
filter(play_type %in% c("pass", "run"), !is.na(down), !is.na(epa)) %>%
mutate(is_pass = factor(play_type == "pass", levels = c(FALSE, TRUE))) %>%
select(season, week, is_pass, down, ydstogo, yardline_100,
score_differential, game_seconds_remaining) %>%
drop_na()
# Time-series cross-validation (train on past seasons, test on the next one)
results <- list()
for (test_year in 2021:2023) {
train <- model_data %>% filter(season < test_year)
test <- model_data %>% filter(season == test_year)
model <- glm(is_pass ~ down + ydstogo + yardline_100 +
score_differential + game_seconds_remaining,
data = train, family = "binomial")
preds <- predict(model, test, type = "response")
# The comparison already yields a logical vector; no ifelse() needed.
pred_class <- preds > 0.5
accuracy <- mean(pred_class == test$is_pass)
results[[as.character(test_year)]] <- accuracy
cat("Test Year:", test_year, "- Accuracy:", round(accuracy * 100, 1), "%\n")
}
# Average performance across held-out seasons
cat("\nAverage Accuracy:", round(mean(unlist(results)) * 100, 1), "%\n")
# Week-by-week validation (within season)
# This accounts for within-season learning
week_results <- model_data %>%
filter(season == 2023) %>%
group_by(week) %>%
group_split() %>%
map_dbl(function(test_week) {
w <- unique(test_week$week)
# Train only on earlier weeks of the same season; skip tiny training sets.
train <- model_data %>% filter(season == 2023, week < w)
if (nrow(train) < 100) return(NA)
model <- glm(is_pass ~ ., data = train %>% select(-season, -week),
family = "binomial")
preds <- predict(model, test_week, type = "response") > 0.5
mean(preds == test_week$is_pass)
})
cat("\nWeek-by-Week Validation (2023):")
print(week_results, na.print = "N/A")
import nfl_data_py as nfl
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import TimeSeriesSplit
pbp = nfl.import_pbp_data([2020, 2021, 2022, 2023])
# Prepare data
model_data = pbp[
(pbp["play_type"].isin(["pass", "run"])) &
(pbp["down"].notna()) &
(pbp["epa"].notna())
].copy()
model_data["is_pass"] = (model_data["play_type"] == "pass").astype(int)
features = ["down", "ydstogo", "yardline_100", "score_differential", "game_seconds_remaining"]
model_data = model_data[["season", "week"] + features + ["is_pass"]].dropna()
# Time-series cross-validation (train on past, test on future)
results = []
for test_year in [2021, 2022, 2023]:
train = model_data[model_data["season"] < test_year]
test = model_data[model_data["season"] == test_year]
X_train, y_train = train[features], train["is_pass"]
X_test, y_test = test[features], test["is_pass"]
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
results.append(accuracy)
print(f"Test Year: {test_year} - Accuracy: {accuracy * 100:.1f}%")
print(f"\nAverage Accuracy: {sum(results) / len(results) * 100:.1f}%")
# Alternative: sklearn TimeSeriesSplit
# NOTE(review): TimeSeriesSplit partitions by row order, so the folds are
# only truly chronological if model_data is sorted by season/week — confirm
# the ordering nfl_data_py returns before relying on this.
print("\nTimeSeriesSplit (3-fold):")
tscv = TimeSeriesSplit(n_splits=3)
X = model_data[features]
y = model_data["is_pass"]
for i, (train_idx, test_idx) in enumerate(tscv.split(X)):
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
acc = accuracy_score(y_test, model.predict(X_test))
print(f"Fold {i+1}: {acc * 100:.1f}%")
nflfastR
tidyverse
caret
nfl_data_py
pandas
scikit-learn
library(nflfastR)
library(tidyverse)
library(pROC)
pbp <- load_pbp(2023)
schedules <- load_schedules(2023)
# Build a simple win probability prediction
model_data <- pbp %>%
filter(!is.na(wp), !is.na(score_differential)) %>%
select(game_id, play_id, wp, score_differential,
game_seconds_remaining, posteam) %>%
drop_na()
# Create actual outcome per game (result > 0 means the home team won)
game_outcomes <- schedules %>%
filter(!is.na(result)) %>%
mutate(home_win = result > 0) %>%
select(game_id, home_team, home_win)
# Label each play with whether the possession team eventually won.
model_data <- model_data %>%
left_join(game_outcomes, by = "game_id") %>%
mutate(actual_win = (posteam == home_team & home_win) |
(posteam != home_team & !home_win))
# Simple model based on score differential and time
# NOTE: fit and evaluated on the same plays — in-sample metrics only.
model <- glm(actual_win ~ score_differential + game_seconds_remaining,
data = model_data, family = "binomial")
model_data$pred_wp <- predict(model, type = "response")
# Evaluation metrics
# 1. Brier Score (lower is better)
brier <- mean((model_data$pred_wp - model_data$actual_win)^2)
cat("Brier Score:", round(brier, 4), "\n")
# 2. Log Loss (epsilon guards against log(0))
log_loss <- -mean(model_data$actual_win * log(model_data$pred_wp + 1e-10) +
(1 - model_data$actual_win) * log(1 - model_data$pred_wp + 1e-10))
cat("Log Loss:", round(log_loss, 4), "\n")
# 3. AUC-ROC
roc_obj <- roc(model_data$actual_win, model_data$pred_wp, quiet = TRUE)
cat("AUC-ROC:", round(auc(roc_obj), 4), "\n")
# 4. Calibration by probability bucket
calibration <- model_data %>%
mutate(wp_bucket = cut(pred_wp, breaks = seq(0, 1, 0.1))) %>%
group_by(wp_bucket) %>%
summarize(
n = n(),
predicted = mean(pred_wp),
actual = mean(actual_win)
)
# cat() (not print()) so "\n" renders as a newline instead of printing the
# escape sequence literally.
cat("\nCalibration by Probability Bucket:\n")
print(calibration)
import nfl_data_py as nfl
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (brier_score_loss, log_loss, roc_auc_score,
accuracy_score, confusion_matrix)
pbp = nfl.import_pbp_data([2023])
schedules = nfl.import_schedules([2023])
# Prepare data
model_data = pbp[
(pbp["wp"].notna()) &
(pbp["score_differential"].notna())
][["game_id", "wp", "score_differential", "game_seconds_remaining", "posteam"]].dropna()
# Get actual outcomes (result > 0 means the home team won)
outcomes = schedules[schedules["result"].notna()][["game_id", "home_team", "result"]].copy()
outcomes["home_win"] = outcomes["result"] > 0
model_data = model_data.merge(outcomes, on="game_id")
# Label each play with whether the possession team eventually won.
model_data["actual_win"] = (
((model_data["posteam"] == model_data["home_team"]) & model_data["home_win"]) |
((model_data["posteam"] != model_data["home_team"]) & ~model_data["home_win"])
).astype(int)
# Simple model
# NOTE(review): fit and evaluated on the same data — in-sample metrics only.
X = model_data[["score_differential", "game_seconds_remaining"]]
y = model_data["actual_win"]
model = LogisticRegression(max_iter=1000)
model.fit(X, y)
pred_wp = model.predict_proba(X)[:, 1]
# Evaluation metrics
print("Model Evaluation Metrics:")
print(f"Brier Score: {brier_score_loss(y, pred_wp):.4f}")
print(f"Log Loss: {log_loss(y, pred_wp):.4f}")
print(f"AUC-ROC: {roc_auc_score(y, pred_wp):.4f}")
print(f"Accuracy: {accuracy_score(y, (pred_wp > 0.5).astype(int)) * 100:.1f}%")
# Calibration: do plays predicted around X% actually win around X% of the time?
model_data["pred_wp"] = pred_wp
model_data["wp_bucket"] = pd.cut(pred_wp, bins=np.arange(0, 1.1, 0.1))
calibration = (model_data.groupby("wp_bucket")
.agg(n=("actual_win", "count"),
predicted=("pred_wp", "mean"),
actual=("actual_win", "mean"))
.reset_index())
print("\nCalibration by Probability Bucket:")
print(calibration)
nflfastR
tidyverse
pROC
nfl_data_py
pandas
scikit-learn
library(nflfastR)
library(tidyverse)
library(caret)
pbp <- load_pbp(2023)
# Prepare data
model_data <- pbp %>%
filter(play_type %in% c("pass", "run"), !is.na(down)) %>%
# caret's classProbs/twoClassSummary require class levels that are valid R
# variable names; "TRUE"/"FALSE" are reserved words and make train() error,
# so label the classes "run"/"pass" (matching the ensemble example).
mutate(is_pass = factor(play_type == "pass", levels = c(FALSE, TRUE), labels = c("run", "pass"))) %>%
select(is_pass, down, ydstogo, yardline_100,
score_differential, game_seconds_remaining) %>%
drop_na() %>%
sample_n(min(10000, n())) # Subsample for speed
# Define hyperparameter grid for Random Forest (mtry = vars tried per split)
rf_grid <- expand.grid(
mtry = c(2, 3, 4, 5)
)
# Cross-validation control: 5-fold CV with class probabilities for the ROC metric
train_control <- trainControl(
method = "cv",
number = 5,
verboseIter = TRUE,
classProbs = TRUE,
summaryFunction = twoClassSummary
)
# Train with grid search
set.seed(42)
rf_tuned <- train(
is_pass ~ .,
data = model_data,
method = "rf",
trControl = train_control,
tuneGrid = rf_grid,
metric = "ROC",
ntree = 50
)
# Results
print(rf_tuned)
print(rf_tuned$bestTune)
# Plot tuning results
plot(rf_tuned)
# Best model performance (aggregated over the CV resamples)
confusionMatrix(rf_tuned)
import nfl_data_py as nfl
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import make_scorer, roc_auc_score
pbp = nfl.import_pbp_data([2023])
# Prepare data
model_data = pbp[
(pbp["play_type"].isin(["pass", "run"])) &
(pbp["down"].notna())
].copy()
model_data["is_pass"] = (model_data["play_type"] == "pass").astype(int)
features = ["down", "ydstogo", "yardline_100", "score_differential", "game_seconds_remaining"]
model_data = model_data[features + ["is_pass"]].dropna()
# Subsample for speed
model_data = model_data.sample(n=min(10000, len(model_data)), random_state=42)
X = model_data[features]
y = model_data["is_pass"]
# Define hyperparameter grid (full grid shown for reference; not used below)
param_grid = {
"n_estimators": [50, 100, 200],
"max_depth": [5, 10, 15, None],
"min_samples_split": [2, 5, 10],
"min_samples_leaf": [1, 2, 4]
}
# Smaller grid for faster demo
small_grid = {
"n_estimators": [50, 100],
"max_depth": [5, 10],
"min_samples_split": [2, 5]
}
# Grid search with cross-validation (5-fold, AUC scoring, all CPU cores)
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
rf,
small_grid,
cv=5,
scoring="roc_auc",
n_jobs=-1,
verbose=1
)
grid_search.fit(X, y)
# Results
print(f"\nBest Parameters: {grid_search.best_params_}")
print(f"Best CV Score (AUC): {grid_search.best_score_:.4f}")
# All results (rank_test_score: 1 = best, so nsmallest picks the top performers)
results = pd.DataFrame(grid_search.cv_results_)
print("\nTop 5 Parameter Combinations:")
print(results.nsmallest(5, "rank_test_score")[["params", "mean_test_score", "std_test_score"]])
nflfastR
tidyverse
caret
nfl_data_py
pandas
scikit-learn
library(nflfastR)
library(tidyverse)
library(caret)
library(randomForest)
pbp <- load_pbp(2023)
# Prepare data ("run"/"pass" labels keep the factor levels valid R names)
model_data <- pbp %>%
filter(play_type %in% c("pass", "run"), !is.na(down)) %>%
mutate(is_pass = factor(play_type == "pass", levels = c(FALSE, TRUE), labels = c("run", "pass"))) %>%
select(is_pass, down, ydstogo, yardline_100,
score_differential, game_seconds_remaining) %>%
drop_na() %>%
sample_n(min(5000, n()))
# Split data 80/20
set.seed(42)
train_idx <- sample(nrow(model_data), 0.8 * nrow(model_data))
train <- model_data[train_idx, ]
test <- model_data[-train_idx, ]
# Train individual models
# 1. Logistic Regression (binomial glm models P(second level) = P("pass"))
glm_model <- glm(is_pass ~ ., data = train, family = "binomial")
glm_pred <- predict(glm_model, test, type = "response")
# 2. Random Forest (take the "pass" class probability column)
rf_model <- randomForest(is_pass ~ ., data = train, ntree = 100)
rf_pred <- predict(rf_model, test, type = "prob")[, "pass"]
# 3. Simple rule-based (pass rate by down)
rule_pred <- test %>%
left_join(
train %>% group_by(down) %>% summarize(base_rate = mean(is_pass == "pass")),
by = "down"
) %>%
pull(base_rate)
# Ensemble predictions (unweighted average of the three probabilities)
ensemble_pred <- (glm_pred + rf_pred + rule_pred) / 3
ensemble_class <- ifelse(ensemble_pred > 0.5, "pass", "run")
# Evaluate each model at the 0.5 threshold
actual <- test$is_pass
results <- tibble(
Model = c("Logistic", "Random Forest", "Rule-based", "Ensemble"),
Accuracy = c(
mean((glm_pred > 0.5) == (actual == "pass")),
mean((rf_pred > 0.5) == (actual == "pass")),
mean((rule_pred > 0.5) == (actual == "pass")),
mean(ensemble_class == actual)
)
)
print(results)
import nfl_data_py as nfl
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
pbp = nfl.import_pbp_data([2023])
# Prepare data
model_data = pbp[
(pbp["play_type"].isin(["pass", "run"])) &
(pbp["down"].notna())
].copy()
model_data["is_pass"] = (model_data["play_type"] == "pass").astype(int)
features = ["down", "ydstogo", "yardline_100", "score_differential", "game_seconds_remaining"]
model_data = model_data[features + ["is_pass"]].dropna()
# Subsample for speed.
model_data = model_data.sample(n=min(5000, len(model_data)), random_state=42)
X = model_data[features]
y = model_data["is_pass"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Individual models
lr = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)
# Train and evaluate individually
results = []
for name, model in [("Logistic", lr), ("Random Forest", rf), ("KNN", knn)]:
model.fit(X_train, y_train)
acc = accuracy_score(y_test, model.predict(X_test))
results.append({"Model": name, "Accuracy": acc})
# Voting Ensemble ("soft" averages predicted probabilities; VotingClassifier
# clones and refits the lr/rf/knn estimators on fit())
voting = VotingClassifier(
estimators=[("lr", lr), ("rf", rf), ("knn", knn)],
voting="soft"
)
voting.fit(X_train, y_train)
results.append({"Model": "Voting Ensemble", "Accuracy": accuracy_score(y_test, voting.predict(X_test))})
# Stacking Ensemble (base-model out-of-fold predictions feed a meta-learner)
stacking = StackingClassifier(
estimators=[("lr", LogisticRegression(max_iter=1000)),
("rf", RandomForestClassifier(n_estimators=50))],
final_estimator=LogisticRegression()
)
stacking.fit(X_train, y_train)
results.append({"Model": "Stacking Ensemble", "Accuracy": accuracy_score(y_test, stacking.predict(X_test))})
print("Model Comparison:")
print(pd.DataFrame(results).sort_values("Accuracy", ascending=False))
nflfastR
tidyverse
caret
randomForest
nfl_data_py
pandas
scikit-learn
library(nflfastR)
library(tidyverse)
library(nnet)
schedules <- load_schedules(2020:2023)
pbp <- load_pbp(2020:2023)
# Calculate team features (season-level offensive/defensive EPA per play)
team_features <- pbp %>%
filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
group_by(season, posteam) %>%
summarize(
off_epa = mean(epa),
pass_rate = mean(play_type == "pass"),
.groups = "drop"
) %>%
left_join(
pbp %>%
filter(!is.na(epa)) %>%
group_by(season, defteam) %>%
summarize(def_epa = mean(epa), .groups = "drop"),
by = c("season", "posteam" = "defteam")
)
# Create game dataset
games <- schedules %>%
filter(!is.na(result), !is.na(spread_line)) %>%
left_join(team_features, by = c("season", "home_team" = "posteam")) %>%
rename(home_off = off_epa, home_def = def_epa) %>%
left_join(team_features %>% select(season, posteam, off_epa, def_epa),
by = c("season", "away_team" = "posteam")) %>%
rename(away_off = off_epa, away_def = def_epa) %>%
mutate(
# nflfastR convention: result = home margin, spread_line > 0 when the
# home team is favored. The home side covers when result exceeds the
# spread, so the error is result MINUS spread_line (the original
# `result + spread_line` had the sign flipped).
spread_error = result - spread_line # Positive = home beat spread
) %>%
select(home_off, home_def, away_off, away_def, spread_error) %>%
drop_na()
# Scale features so the network trains on comparable magnitudes
scaled <- scale(games %>% select(-spread_error))
games_scaled <- cbind(as.data.frame(scaled), spread_error = games$spread_error)
# Split data 80/20
set.seed(42)
train_idx <- sample(nrow(games_scaled), 0.8 * nrow(games_scaled))
train <- games_scaled[train_idx, ]
test <- games_scaled[-train_idx, ]
# Train neural network
nn_model <- nnet(
spread_error ~ .,
data = train,
size = 10, # Hidden layer size
linout = TRUE, # Linear output for regression
maxit = 500,
decay = 0.01
)
# Predict and evaluate (MAE/RMSE in points; correct-side = ATS pick rate)
predictions <- predict(nn_model, test)
mae <- mean(abs(predictions - test$spread_error))
rmse <- sqrt(mean((predictions - test$spread_error)^2))
cat("MAE:", round(mae, 2), "\n")
cat("RMSE:", round(rmse, 2), "\n")
cat("Correct Side %:", round(mean((predictions > 0) == (test$spread_error > 0)) * 100, 1), "%\n")
import nfl_data_py as nfl
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

pbp = nfl.import_pbp_data([2020, 2021, 2022, 2023])
schedules = nfl.import_schedules([2020, 2021, 2022, 2023])

# Season-level team features: offensive EPA/play + pass rate, defensive EPA/play
plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"].isin(["pass", "run"]))]
team_off = (plays.groupby(["season", "posteam"])
            .agg(off_epa=("epa", "mean"), pass_rate=("play_type", lambda x: (x == "pass").mean()))
            .reset_index())
team_def = (plays.groupby(["season", "defteam"])
            .agg(def_epa=("epa", "mean"))
            .reset_index())
team_features = team_off.merge(team_def, left_on=["season", "posteam"],
                               right_on=["season", "defteam"])

# Game dataset: attach home and away features to every completed game
games = schedules[(schedules["result"].notna()) & (schedules["spread_line"].notna())].copy()
games = games.merge(team_features[["season", "posteam", "off_epa", "def_epa"]],
                    left_on=["season", "home_team"], right_on=["season", "posteam"])
games = games.rename(columns={"off_epa": "home_off", "def_epa": "home_def"})
games = games.merge(team_features[["season", "posteam", "off_epa", "def_epa"]],
                    left_on=["season", "away_team"], right_on=["season", "posteam"])
games = games.rename(columns={"off_epa": "away_off", "def_epa": "away_def"})
# nflverse convention: result = home margin, spread_line > 0 = home favored.
# Home covers when result > spread_line, so the ATS margin is the DIFFERENCE;
# the original summed them, which mislabels covers.
games["spread_error"] = games["result"] - games["spread_line"]
features = ["home_off", "home_def", "away_off", "away_def"]
games = games[features + ["spread_error"]].dropna()

# Scale and split (80/20)
X = games[features]
y = games["spread_error"]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train a small MLP; early stopping guards against overfitting
nn = MLPRegressor(hidden_layer_sizes=(16, 8), max_iter=500, random_state=42, early_stopping=True)
nn.fit(X_train, y_train)

# Evaluate on the holdout set
predictions = nn.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
correct_side = ((predictions > 0) == (y_test > 0)).mean()
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"Correct Side %: {correct_side * 100:.1f}%")
nflfastR
tidyverse
nnet
nfl_data_py
pandas
scikit-learn
Analyze play-level data including formations, personnel, and sequencing
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Play type distribution across all plays with a recorded type
play_types <- pbp %>%
  filter(!is.na(play_type)) %>%
  count(play_type) %>%
  mutate(pct = n / sum(n) * 100) %>%
  arrange(desc(n))
print("Play Type Distribution:")
print(play_types)

# Pass rate by down (pass/run snaps only)
by_down <- pbp %>%
  filter(play_type %in% c("pass", "run"), !is.na(down)) %>%
  group_by(down) %>%
  summarize(
    plays = n(),
    pass_rate = mean(play_type == "pass") * 100,
    .groups = "drop"
  )
# cat() renders "\n" as a newline; print() would show the escape literally
cat("\nPass Rate by Down:\n")
print(by_down)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# How often each play type occurs, as counts and as a share of all snaps
typed = pbp[pbp["play_type"].notna()]
type_counts = typed.groupby("play_type").size()
play_types = type_counts.reset_index(name="count").sort_values("count", ascending=False)
play_types["pct"] = 100 * play_types["count"] / play_types["count"].sum()
print("Play Type Distribution:")
print(play_types)

# Pass rate on each down; pass/run snaps with a known down only
down_mask = pbp["play_type"].isin(["pass", "run"]) & pbp["down"].notna()
by_down = (pbp[down_mask]
           .groupby("down")
           .agg(plays=("play_type", "count"),
                pass_rate=("play_type", lambda s: 100 * s.eq("pass").mean()))
           .reset_index())
print("\nPass Rate by Down:")
print(by_down)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Pass/run snaps that have a charted offensive formation
snaps <- pbp %>%
  filter(!is.na(offense_formation), play_type %in% c("pass", "run"))

# Usage and efficiency per formation, minimum 100 snaps, most-used first
formation_usage <- snaps %>%
  group_by(offense_formation) %>%
  summarize(
    plays = n(),
    pass_rate = 100 * mean(play_type == "pass"),
    epa = mean(epa, na.rm = TRUE),
    success_rate = 100 * mean(success, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(plays >= 100) %>%
  arrange(desc(plays))
print(formation_usage)

# Each formation's share of its team's snap mix
team_formations <- snaps %>%
  count(posteam, offense_formation, name = "plays") %>%
  group_by(posteam) %>%
  mutate(pct = 100 * plays / sum(plays)) %>%
  arrange(posteam, desc(pct))
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Formation usage: pass rate, EPA, and success rate per offensive formation,
# limited to pass/run snaps with a recorded formation.
plays = pbp[(pbp["offense_formation"].notna()) & (pbp["play_type"].isin(["pass", "run"]))]
formation_usage = (plays.groupby("offense_formation")
                   .agg(
                       # NOTE(review): ("epa", "count") counts non-null epa values,
                       # so snaps missing EPA are excluded from the play count
                       plays=("epa", "count"),
                       pass_rate=("play_type", lambda x: (x == "pass").mean() * 100),
                       epa=("epa", "mean"),
                       success_rate=("success", lambda x: x.mean() * 100)
                   )
                   .reset_index())
# Keep formations with at least 100 qualifying snaps, most-used first
formation_usage = formation_usage[formation_usage["plays"] >= 100].sort_values("plays", ascending=False)
print("Formation Usage:")
print(formation_usage)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Personnel grouping analysis
# League-wide usage of each offensive personnel package (e.g. "11", "12"),
# restricted to pass/run snaps; packages under 200 league-wide snaps dropped.
personnel <- pbp %>%
  filter(!is.na(offense_personnel), play_type %in% c("pass", "run")) %>%
  group_by(offense_personnel) %>%
  summarize(
    plays = n(),
    pass_rate = mean(play_type == "pass") * 100,
    epa = mean(epa, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(plays >= 200) %>%
  arrange(desc(plays))
print("Personnel Grouping Usage:")
print(personnel)

# Team personnel tendencies
# The grouped mutate stores each team's TOTAL snap count before regrouping by
# (team, personnel); first(team_plays) is then the denominator, so pct is the
# share of that team's snaps (packages under 5% of a team's mix are dropped).
team_personnel <- pbp %>%
  filter(!is.na(offense_personnel), play_type %in% c("pass", "run")) %>%
  group_by(posteam) %>%
  mutate(team_plays = n()) %>%
  group_by(posteam, offense_personnel) %>%
  summarize(
    plays = n(),
    pct = n() / first(team_plays) * 100,
    .groups = "drop"
  ) %>%
  filter(pct >= 5) %>%
  arrange(posteam, desc(pct))
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Personnel grouping analysis: pass rate and EPA for each offensive
# personnel package, restricted to pass/run snaps with charted personnel.
plays = pbp[(pbp["offense_personnel"].notna()) & (pbp["play_type"].isin(["pass", "run"]))]
personnel = (plays.groupby("offense_personnel")
             .agg(
                 # NOTE(review): ("epa", "count") counts non-null epa values,
                 # so snaps missing EPA are excluded from the play count
                 plays=("epa", "count"),
                 pass_rate=("play_type", lambda x: (x == "pass").mean() * 100),
                 epa=("epa", "mean")
             )
             .reset_index())
# Keep packages with at least 200 league-wide snaps, most-used first
personnel = personnel[personnel["plays"] >= 200].sort_values("plays", ascending=False)
print("Personnel Grouping Usage:")
print(personnel)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Game script analysis
# Bucket every pass/run snap by the offense's current score differential.
# case_when evaluates top-down, so each branch implicitly excludes the
# ranges matched above it (e.g. "Up 7-13" really means 7..13).
game_script <- pbp %>%
  filter(play_type %in% c("pass", "run"), !is.na(score_differential)) %>%
  mutate(
    script = case_when(
      score_differential >= 14 ~ "Up 14+",
      score_differential >= 7 ~ "Up 7-13",
      score_differential >= 1 ~ "Up 1-6",
      score_differential == 0 ~ "Tied",
      score_differential >= -6 ~ "Down 1-6",
      score_differential >= -13 ~ "Down 7-13",
      TRUE ~ "Down 14+"
    ),
    # Explicit factor levels make the grouped output run from the biggest
    # deficit to the biggest lead instead of alphabetical order
    script = factor(script, levels = c("Down 14+", "Down 7-13", "Down 1-6",
                                       "Tied", "Up 1-6", "Up 7-13", "Up 14+"))
  ) %>%
  group_by(script) %>%
  summarize(
    plays = n(),
    pass_rate = mean(play_type == "pass") * 100,
    epa = mean(epa, na.rm = TRUE),
    .groups = "drop"
  )
print("Game Script Analysis:")
print(game_script)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Game script analysis: bucket pass/run snaps by the offense's current margin
plays = pbp[(pbp["play_type"].isin(["pass", "run"])) & (pbp["score_differential"].notna())].copy()

def get_script(diff):
    # Map a score differential to a labeled bucket; branches are checked
    # top-down, so each implicitly excludes the ranges above it.
    if diff >= 14: return "Up 14+"
    elif diff >= 7: return "Up 7-13"
    elif diff >= 1: return "Up 1-6"
    elif diff == 0: return "Tied"
    elif diff >= -6: return "Down 1-6"
    elif diff >= -13: return "Down 7-13"
    else: return "Down 14+"

plays["script"] = plays["score_differential"].apply(get_script)
game_script = (plays.groupby("script")
               .agg(
                   plays=("epa", "count"),  # counts non-null EPA values
                   pass_rate=("play_type", lambda x: (x == "pass").mean() * 100),
                   epa=("epa", "mean")
               )
               .reset_index())
# Order properly: the ordered categorical sorts rows from the biggest
# deficit to the biggest lead rather than alphabetically
order = ["Down 14+", "Down 7-13", "Down 1-6", "Tied", "Up 1-6", "Up 7-13", "Up 14+"]
game_script["script"] = pd.Categorical(game_script["script"], categories=order, ordered=True)
game_script = game_script.sort_values("script")
print("Game Script Analysis:")
print(game_script)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# No-huddle vs huddled snaps: volume, pass rate, EPA, success rate
no_huddle <- pbp %>%
  filter(play_type %in% c("pass", "run"), !is.na(no_huddle)) %>%
  mutate(no_huddle = if_else(no_huddle == 1, "No-Huddle", "Huddle")) %>%
  group_by(no_huddle) %>%
  summarize(
    plays = n(),
    pass_rate = mean(play_type == "pass") * 100,
    epa = mean(epa, na.rm = TRUE),
    success_rate = mean(success, na.rm = TRUE) * 100,
    .groups = "drop"
  )
print("No-Huddle vs Huddle:")
print(no_huddle)

# Teams ranked by no-huddle volume
team_no_huddle <- pbp %>%
  filter(play_type %in% c("pass", "run"), no_huddle == 1) %>%
  group_by(posteam) %>%
  summarize(
    no_huddle_plays = n(),
    epa = mean(epa, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(no_huddle_plays))
# cat() renders "\n" as a newline; print() would show the escape literally
cat("\nTop No-Huddle Teams:\n")
print(head(team_no_huddle, 10))
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Compare huddled vs no-huddle snaps on volume, pass rate, EPA, and success
mask = pbp["play_type"].isin(["pass", "run"]) & pbp["no_huddle"].notna()
plays = pbp[mask].copy()
plays["huddle_type"] = plays["no_huddle"].map(lambda v: "No-Huddle" if v == 1 else "Huddle")

no_huddle = (plays.groupby("huddle_type")
             .agg(plays=("epa", "count"),
                  pass_rate=("play_type", lambda s: s.eq("pass").mean() * 100),
                  epa=("epa", "mean"),
                  success_rate=("success", lambda s: s.mean() * 100))
             .reset_index())
print("No-Huddle vs Huddle:")
print(no_huddle)

# Teams ranked by no-huddle snap volume
nh_snaps = plays[plays["no_huddle"] == 1]
team_nh = (nh_snaps.groupby("posteam")
           .agg(plays=("epa", "count"), epa=("epa", "mean"))
           .reset_index()
           .sort_values("plays", ascending=False))
print("\nTop No-Huddle Teams:")
print(team_nh.head(10))
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Shotgun vs under-center usage and efficiency
shotgun_analysis <- pbp %>%
  filter(play_type %in% c("pass", "run"), !is.na(shotgun)) %>%
  mutate(formation = if_else(shotgun == 1, "Shotgun", "Under Center")) %>%
  group_by(formation) %>%
  summarize(
    plays = n(),
    # magrittr's `.` is the data as it enters this pipe stage (all filtered
    # snaps), so pct is each alignment's share of ALL qualifying snaps
    pct = n() / nrow(.) * 100,
    pass_rate = mean(play_type == "pass") * 100,
    epa = mean(epa, na.rm = TRUE),
    success_rate = mean(success, na.rm = TRUE) * 100,
    .groups = "drop"
  )
print("Shotgun vs Under Center:")
print(shotgun_analysis)

# Efficiency split further by pass vs run
by_play_type <- pbp %>%
  filter(play_type %in% c("pass", "run"), !is.na(shotgun)) %>%
  mutate(formation = if_else(shotgun == 1, "Shotgun", "Under Center")) %>%
  group_by(formation, play_type) %>%
  summarize(
    plays = n(),
    epa = mean(epa, na.rm = TRUE),
    .groups = "drop"
  )
# cat() renders "\n" as a newline; print() would show the escape literally
cat("\nBy Play Type:\n")
print(by_play_type)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Shotgun vs under-center: usage share, pass rate, EPA, and success rate
plays = pbp[(pbp["play_type"].isin(["pass", "run"])) & (pbp["shotgun"].notna())].copy()
plays["formation"] = plays["shotgun"].apply(lambda x: "Shotgun" if x == 1 else "Under Center")
shotgun_analysis = (plays.groupby("formation")
                    .agg(
                        plays=("epa", "count"),  # counts non-null EPA values
                        pass_rate=("play_type", lambda x: (x == "pass").mean() * 100),
                        epa=("epa", "mean"),
                        success_rate=("success", lambda x: x.mean() * 100)
                    )
                    .reset_index())
# Each alignment's share of all qualifying snaps
shotgun_analysis["pct"] = shotgun_analysis["plays"] / shotgun_analysis["plays"].sum() * 100
print("Shotgun vs Under Center:")
print(shotgun_analysis)

# Efficiency split further by pass vs run
by_type = (plays.groupby(["formation", "play_type"])
           .agg(plays=("epa", "count"), epa=("epa", "mean"))
           .reset_index())
print("\nBy Play Type:")
print(by_type)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Motion vs no-motion snaps (pass/run only)
motion_analysis <- pbp %>%
  filter(play_type %in% c("pass", "run"), !is.na(is_motion)) %>%
  mutate(motion = if_else(is_motion == 1, "Motion", "No Motion")) %>%
  group_by(motion) %>%
  summarize(
    plays = n(),
    pass_rate = mean(play_type == "pass") * 100,
    epa = mean(epa, na.rm = TRUE),
    success_rate = mean(success, na.rm = TRUE) * 100,
    .groups = "drop"
  )
print("Motion Impact on Play Success:")
print(motion_analysis)

# Team motion usage; NA is_motion is treated as "no motion" in the EPA split
team_motion <- pbp %>%
  filter(play_type %in% c("pass", "run")) %>%
  group_by(posteam) %>%
  summarize(
    total_plays = n(),
    motion_plays = sum(is_motion == 1, na.rm = TRUE),
    motion_rate = mean(is_motion == 1, na.rm = TRUE) * 100,
    motion_epa = mean(epa[is_motion == 1], na.rm = TRUE),
    no_motion_epa = mean(epa[is_motion != 1 | is.na(is_motion)], na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(motion_advantage = motion_epa - no_motion_epa) %>%
  arrange(desc(motion_rate))
# cat() renders "\n" as a newline; print() would show the escape literally
cat("\nTeam Motion Usage:\n")
print(head(team_motion, 10))
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Motion analysis: EPA with vs without pre-snap motion (pass/run only).
# NaN is_motion counts as "no motion".
plays = pbp[(pbp["play_type"].isin(["pass", "run"]))].copy()
motion = plays[plays["is_motion"] == 1]
no_motion = plays[(plays["is_motion"] != 1) | (plays["is_motion"].isna())]
print("Motion Impact on Play Success:")
# Backslash escapes inside f-string expressions are a SyntaxError before
# Python 3.12 -- use single quotes for the column lookup instead of \"...\"
print(f"With Motion: {len(motion)} plays, EPA: {motion['epa'].mean():.3f}")
print(f"No Motion: {len(no_motion)} plays, EPA: {no_motion['epa'].mean():.3f}")

# Team motion usage
def calc_motion_stats(group):
    # Per-team motion volume, rate, and EPA on motion snaps; guards keep
    # empty groups from dividing by zero or averaging an empty frame
    motion_plays = group[group["is_motion"] == 1]
    return pd.Series({
        "total_plays": len(group),
        "motion_plays": len(motion_plays),
        "motion_rate": len(motion_plays) / len(group) * 100 if len(group) > 0 else 0,
        "motion_epa": motion_plays["epa"].mean() if len(motion_plays) > 0 else 0
    })

team_motion = plays.groupby("posteam").apply(calc_motion_stats).reset_index()
team_motion = team_motion.sort_values("motion_rate", ascending=False)
print("\nTeam Motion Usage:")
print(team_motion.head(10))
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Attach each snap's previous play (same game + offense).
# Note: arrange() ignores groups, but a global sort on play_id still leaves
# every (game_id, posteam) group internally in order, so lag() is correct.
play_sequence <- pbp %>%
  filter(play_type %in% c("pass", "run")) %>%
  group_by(game_id, posteam) %>%
  arrange(play_id) %>%
  mutate(
    prev_play = lag(play_type),
    prev_success = lag(success),
    prev_epa = lag(epa)
  ) %>%
  ungroup() %>%
  filter(!is.na(prev_play))

# Play calling after a successful vs failed previous snap
after_result <- play_sequence %>%
  mutate(prev_result = if_else(prev_success == 1, "After Success", "After Failure")) %>%
  group_by(prev_result) %>%
  summarize(
    plays = n(),
    pass_rate = mean(play_type == "pass") * 100,
    epa = mean(epa, na.rm = TRUE),
    .groups = "drop"
  )
print("Play Calling After Success/Failure:")
print(after_result)

# Run-Run, Run-Pass, Pass-Run, Pass-Pass sequences
sequences <- play_sequence %>%
  mutate(sequence = paste(prev_play, play_type, sep = " -> ")) %>%
  group_by(sequence) %>%
  summarize(
    plays = n(),
    epa = mean(epa, na.rm = TRUE),
    success_rate = mean(success, na.rm = TRUE) * 100,
    .groups = "drop"
  ) %>%
  arrange(desc(plays))
# cat() renders "\n" as a newline; print() would show the escape literally
cat("\nPlay Sequences:\n")
print(sequences)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Play sequencing: what does an offense call after its previous snap?
plays = pbp[pbp["play_type"].isin(["pass", "run"])].copy()
# Sorting by (game_id, play_id) puts every game in chronological order;
# the grouped shift() below then looks back within one game + offense only
plays = plays.sort_values(["game_id", "play_id"])
plays["prev_play"] = plays.groupby(["game_id", "posteam"])["play_type"].shift(1)
plays["prev_success"] = plays.groupby(["game_id", "posteam"])["success"].shift(1)
# Drop each offense's first snap of the game (it has no previous play)
plays = plays[plays["prev_play"].notna()]

# Play calling after a successful vs failed previous snap
# NOTE(review): a NaN prev_success value falls into "After Failure" here
plays["prev_result"] = plays["prev_success"].apply(
    lambda x: "After Success" if x == 1 else "After Failure")
after_result = (plays.groupby("prev_result")
                .agg(
                    plays=("epa", "count"),  # counts non-null EPA values
                    pass_rate=("play_type", lambda x: (x == "pass").mean() * 100),
                    epa=("epa", "mean")
                )
                .reset_index())
print("Play Calling After Success/Failure:")
print(after_result)

# Two-play sequences (run -> run, run -> pass, ...), most common first
plays["sequence"] = plays["prev_play"] + " -> " + plays["play_type"]
sequences = (plays.groupby("sequence")
             .agg(
                 plays=("epa", "count"),
                 epa=("epa", "mean"),
                 success_rate=("success", lambda x: x.mean() * 100)
             )
             .reset_index()
             .sort_values("plays", ascending=False))
print("\nPlay Sequences:")
print(sequences)
nflfastR
tidyverse
nfl_data_py
pandas
Analyze decision-making in key game situations
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Classify fourth-down plays into coaching decisions
fourth_downs <- pbp %>%
  filter(down == 4, !is.na(play_type)) %>%
  mutate(
    decision = case_when(
      play_type %in% c("pass", "run") ~ "Go for it",
      play_type == "field_goal" ~ "Field Goal",
      play_type == "punt" ~ "Punt",
      TRUE ~ "Other"
    )
  ) %>%
  filter(decision != "Other")

# Decision mix by field position (yardline_100 = yards to the end zone)
decision_analysis <- fourth_downs %>%
  mutate(
    zone = case_when(
      yardline_100 <= 3 ~ "Goal line (1-3)",
      yardline_100 <= 10 ~ "Red zone (4-10)",
      yardline_100 <= 40 ~ "Opp territory",
      yardline_100 <= 60 ~ "Midfield",
      TRUE ~ "Own territory"
    )
  ) %>%
  group_by(zone, decision) %>%
  summarize(plays = n(), .groups = "drop") %>%
  pivot_wider(names_from = decision, values_from = plays, values_fill = 0)
print("Fourth Down Decisions by Field Position:")
print(decision_analysis)

# Conversion rate when going for it (first down or touchdown)
go_for_it <- fourth_downs %>%
  filter(decision == "Go for it") %>%
  summarize(
    attempts = n(),
    conversions = sum(first_down == 1 | touchdown == 1, na.rm = TRUE),
    success_rate = mean(first_down == 1 | touchdown == 1, na.rm = TRUE) * 100
  )
# cat() renders "\n" as a newline; print() would show the escape literally
cat("\nGo-for-it Success Rate:\n")
print(go_for_it)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Classify fourth-down plays into coaching decisions
fourth = pbp[(pbp["down"] == 4) & (pbp["play_type"].notna())].copy()

DECISIONS = {"pass": "Go for it", "run": "Go for it",
             "field_goal": "Field Goal", "punt": "Punt"}
fourth["decision"] = fourth["play_type"].map(lambda pt: DECISIONS.get(pt, "Other"))
fourth = fourth[fourth["decision"] != "Other"]

# Bucket field position (yardline_100 = yards from the opponent end zone)
def get_zone(yd):
    if yd <= 3:
        return "Goal line (1-3)"
    if yd <= 10:
        return "Red zone (4-10)"
    if yd <= 40:
        return "Opp territory"
    if yd <= 60:
        return "Midfield"
    return "Own territory"

fourth["zone"] = fourth["yardline_100"].apply(get_zone)

decision_analysis = (fourth.groupby(["zone", "decision"])
                     .size()
                     .reset_index(name="plays")
                     .pivot(index="zone", columns="decision", values="plays")
                     .fillna(0))
print("Fourth Down Decisions by Field Position:")
print(decision_analysis)

# Conversion rate when going for it (first down or touchdown)
go_for_it = fourth[fourth["decision"] == "Go for it"]
converted = (go_for_it["first_down"] == 1) | (go_for_it["touchdown"] == 1)
success_rate = converted.mean() * 100
print(f"\nGo-for-it Success Rate: {success_rate:.1f}%")
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2019:2023)

# Two-point conversion analysis (2019-2023)
two_point <- pbp %>%
  filter(two_point_attempt == 1)

# Overall success rate; na.rm guards against attempts with a missing result,
# matching the by-type and team summaries below (it was missing here)
overall <- two_point %>%
  summarize(
    attempts = n(),
    successes = sum(two_point_conv_result == "success", na.rm = TRUE),
    success_rate = mean(two_point_conv_result == "success", na.rm = TRUE) * 100
  )
print("Overall Two-Point Conversion Success:")
print(overall)

# By play type
by_type <- two_point %>%
  filter(play_type %in% c("pass", "run")) %>%
  group_by(play_type) %>%
  summarize(
    attempts = n(),
    successes = sum(two_point_conv_result == "success", na.rm = TRUE),
    success_rate = mean(two_point_conv_result == "success", na.rm = TRUE) * 100
  )
# cat() renders "\n" as a newline; print() would show the escape literally
cat("\nBy Play Type:\n")
print(by_type)

# Team leaders (min. 5 attempts)
team_2pt <- two_point %>%
  group_by(posteam) %>%
  summarize(
    attempts = n(),
    successes = sum(two_point_conv_result == "success", na.rm = TRUE),
    success_rate = mean(two_point_conv_result == "success", na.rm = TRUE) * 100
  ) %>%
  filter(attempts >= 5) %>%
  arrange(desc(success_rate))
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2019, 2020, 2021, 2022, 2023])

# Two-point conversion success, overall and by play type (2019-2023)
two_point = pbp[pbp["two_point_attempt"] == 1].copy()

# Overall success rate across every attempt
success = (two_point["two_point_conv_result"] == "success")
print(f"Overall 2PT Success Rate: {success.mean() * 100:.1f}% ({success.sum()}/{len(two_point)})")

# By play type; "count" counts non-null results, so attempts with a missing
# conversion result are excluded from the `attempts` denominator
by_type = (two_point[two_point["play_type"].isin(["pass", "run"])]
           .groupby("play_type")
           .agg(
               attempts=("two_point_conv_result", "count"),
               successes=("two_point_conv_result", lambda x: (x == "success").sum()),
               success_rate=("two_point_conv_result", lambda x: (x == "success").mean() * 100)
           )
           .reset_index())
print("\nBy Play Type:")
print(by_type)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Crunch-time snaps: 4th quarter, final five minutes, one-score game
late_game <- pbp %>%
  filter(
    qtr == 4,
    game_seconds_remaining <= 300, # Last 5 minutes
    abs(score_differential) <= 8,
    play_type %in% c("pass", "run")
  )

# Pass/run mix and play-clock usage while leading, trailing, or tied;
# sign() maps the margin to -1/0/1, which indexes the three labels
late_plays <- late_game %>%
  mutate(
    situation = c("Trailing", "Tied", "Leading")[sign(score_differential) + 2]
  ) %>%
  group_by(situation) %>%
  summarize(
    plays = n(),
    pass_rate = 100 * mean(play_type == "pass"),
    avg_play_clock = mean(play_clock, na.rm = TRUE),
    .groups = "drop"
  )
print("Late Game Play Selection:")
print(late_plays)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Crunch time: 4th quarter, final five minutes, one-score game, pass/run only
mask = ((pbp["qtr"] == 4)
        & (pbp["game_seconds_remaining"] <= 300)
        & (pbp["score_differential"].abs() <= 8)
        & (pbp["play_type"].isin(["pass", "run"])))
late_game = pbp[mask].copy()

def get_situation(diff):
    # Label the offense's position on the scoreboard
    if diff > 0:
        return "Leading"
    if diff < 0:
        return "Trailing"
    return "Tied"

late_game["situation"] = late_game["score_differential"].apply(get_situation)

late_plays = (late_game.groupby("situation")
              .agg(plays=("epa", "count"),
                   pass_rate=("play_type", lambda s: s.eq("pass").mean() * 100))
              .reset_index())
print("Late Game Play Selection:")
print(late_plays)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Timeout usage by situation
timeouts <- pbp %>%
  filter(timeout == 1) %>%
  mutate(
    half = if_else(qtr <= 2, "First Half", "Second Half"),
    close_game = abs(score_differential) <= 8
  )

# Timeout distribution by quarter
timeout_dist <- timeouts %>%
  group_by(half, qtr) %>%
  summarize(
    timeouts = n(),
    .groups = "drop"
  )
print("Timeout Distribution by Quarter:")
print(timeout_dist)

# Team timeout usage patterns; na.rm keeps a missing score differential
# from turning the close-game count into NA (consistent with the mean below)
team_timeouts <- timeouts %>%
  group_by(timeout_team) %>%
  summarize(
    total_timeouts = n(),
    first_half = sum(half == "First Half"),
    second_half = sum(half == "Second Half"),
    close_games = sum(close_game, na.rm = TRUE),
    avg_time_remaining = mean(game_seconds_remaining, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(total_timeouts))
# cat() renders "\n" as a newline; print() would show the escape literally
cat("\nTeam Timeout Usage:\n")
print(team_timeouts)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Timeout usage: when and by whom timeouts are called
timeouts = pbp[pbp["timeout"] == 1].copy()
timeouts["half"] = timeouts["qtr"].apply(lambda x: "First Half" if x <= 2 else "Second Half")
# One-score game (within 8 points) at the moment of the timeout
timeouts["close_game"] = timeouts["score_differential"].abs() <= 8

# Distribution by quarter
timeout_dist = (timeouts.groupby(["half", "qtr"])
                .size()
                .reset_index(name="timeouts"))
print("Timeout Distribution by Quarter:")
print(timeout_dist)

# Team patterns: half split, close-game count, average clock when used
team_timeouts = (timeouts.groupby("timeout_team")
                 .agg(
                     total_timeouts=("timeout", "count"),
                     first_half=("half", lambda x: (x == "First Half").sum()),
                     second_half=("half", lambda x: (x == "Second Half").sum()),
                     close_games=("close_game", "sum"),
                     avg_time_remaining=("game_seconds_remaining", "mean")
                 )
                 .reset_index()
                 .sort_values("total_timeouts", ascending=False))
print("\nTeam Timeout Usage:")
print(team_timeouts)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2019:2023)

# Replay review / coach's challenge analysis (2019-2023)
replays <- pbp %>%
  filter(!is.na(replay_or_challenge_result))

# Overall overturn rate.
# NOTE: summarize() evaluates expressions sequentially, so the rate must be
# computed BEFORE `overturned = sum(overturned)` rebinds the name to a scalar
# (the original computed mean(overturned) afterwards, yielding sum * 100).
success_rate <- replays %>%
  mutate(overturned = replay_or_challenge_result == "overturned") %>%
  summarize(
    total_reviews = n(),
    success_rate = mean(overturned) * 100,
    overturned = sum(overturned)
  )
print("Overall Challenge Results:")
print(success_rate)

# By year; same ordering fix as above
by_year <- replays %>%
  mutate(
    overturned = replay_or_challenge_result == "overturned",
    year = season
  ) %>%
  group_by(year) %>%
  summarize(
    reviews = n(),
    rate = mean(overturned) * 100,
    overturned = sum(overturned),
    .groups = "drop"
  )
# cat() renders "\n" as a newline; print() would show the escape literally
cat("\nChallenge Success by Year:\n")
print(by_year)

# By play type challenged (min. 10 reviews), highest overturn rate first
by_type <- replays %>%
  mutate(overturned = replay_or_challenge_result == "overturned") %>%
  group_by(play_type) %>%
  summarize(
    reviews = n(),
    rate = mean(overturned) * 100,
    .groups = "drop"
  ) %>%
  filter(reviews >= 10) %>%
  arrange(desc(rate))
cat("\nSuccess Rate by Play Type:\n")
print(by_type)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2019, 2020, 2021, 2022, 2023])

# Replay review / challenge outcomes, 2019-2023
replays = pbp[pbp["replay_or_challenge_result"].notna()].copy()
replays["overturned"] = replays["replay_or_challenge_result"] == "overturned"

# Overall overturn rate
total = len(replays)
overturned = replays["overturned"].sum()
print(f"Overall Challenge Success: {overturned}/{total} ({overturned/total*100:.1f}%)")

# By year; each named agg reads the original boolean column independently,
# so `rate` is unaffected by the `overturned` sum alongside it
by_year = (replays.groupby("season")
           .agg(
               reviews=("overturned", "count"),
               overturned=("overturned", "sum"),
               rate=("overturned", lambda x: x.mean() * 100)
           )
           .reset_index())
print("\nChallenge Success by Year:")
print(by_year)

# By play type challenged (min. 10 reviews), highest overturn rate first
by_type = (replays.groupby("play_type")
           .agg(
               reviews=("overturned", "count"),
               rate=("overturned", lambda x: x.mean() * 100)
           )
           .reset_index())
by_type = by_type[by_type["reviews"] >= 10].sort_values("rate", ascending=False)
print("\nSuccess Rate by Play Type:")
print(by_type)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Penalty analysis: frequency, team rates, EPA impact
penalties <- pbp %>%
  filter(penalty == 1)

# Most common penalty types (top 15)
common_penalties <- penalties %>%
  filter(!is.na(penalty_type)) %>%
  count(penalty_type) %>%
  arrange(desc(n)) %>%
  head(15)
print("Most Common Penalties:")
print(common_penalties)

# Team penalty rates over offensive snaps (incl. no_play): only flags charged
# to the offense itself (penalty_team == posteam) count against it
team_penalties <- pbp %>%
  filter(play_type %in% c("pass", "run", "no_play")) %>%
  group_by(posteam) %>%
  summarize(
    plays = n(),
    penalties = sum(penalty == 1 & penalty_team == posteam, na.rm = TRUE),
    penalty_rate = mean(penalty == 1 & penalty_team == posteam, na.rm = TRUE) * 100,
    penalty_yards = sum(penalty_yards[penalty_team == posteam], na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(penalty_rate))
# cat() renders "\n" as a newline; print() would show the escape literally
cat("\nTeam Penalty Rates:\n")
print(team_penalties)

# EPA with vs without a flag on the play
penalty_impact <- pbp %>%
  filter(play_type %in% c("pass", "run")) %>%
  mutate(had_penalty = penalty == 1) %>%
  group_by(had_penalty) %>%
  summarize(
    plays = n(),
    avg_epa = mean(epa, na.rm = TRUE),
    .groups = "drop"
  )
cat("\nEPA Impact of Penalties:\n")
print(penalty_impact)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Penalty analysis: frequency, team rates, and EPA impact
penalties = pbp[pbp["penalty"] == 1]

# Most common penalty types (top 15)
common = (penalties[penalties["penalty_type"].notna()]
          .groupby("penalty_type")
          .size()
          .reset_index(name="count")
          .sort_values("count", ascending=False)
          .head(15))
print("Most Common Penalties:")
print(common)

# Team penalty rates over offensive snaps (incl. no_play)
plays = pbp[pbp["play_type"].isin(["pass", "run", "no_play"])].copy()

def calc_team_penalties(group):
    # group.name is the posteam key for this slice; only flags charged to
    # that same team (penalty_team == team) count against it
    team = group.name
    team_penalties = group[(group["penalty"] == 1) & (group["penalty_team"] == team)]
    return pd.Series({
        "plays": len(group),
        "penalties": len(team_penalties),
        "penalty_rate": len(team_penalties) / len(group) * 100 if len(group) > 0 else 0,
        "penalty_yards": team_penalties["penalty_yards"].sum()
    })

team_penalties = plays.groupby("posteam").apply(calc_team_penalties).reset_index()
team_penalties = team_penalties.sort_values("penalty_rate", ascending=False)
print("\nTeam Penalty Rates:")
print(team_penalties)

# EPA with vs without a flag on the play
plays["had_penalty"] = plays["penalty"] == 1
penalty_impact = (plays.groupby("had_penalty")
                  .agg(plays=("epa", "count"), avg_epa=("epa", "mean"))
                  .reset_index())
print("\nEPA Impact of Penalties:")
print(penalty_impact)
nflfastR
tidyverse
nfl_data_py
pandas
Comprehensive team-level analysis and comparisons
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Calculate weekly EPA for each team (pass/run snaps only)
weekly_epa <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
  group_by(posteam, week) %>%
  summarize(
    plays = n(),
    epa_per_play = mean(epa),
    success_rate = mean(success) * 100,
    .groups = "drop"
  )

# Cumulative mean and right-aligned 3-week rolling mean per team
# (rolling_3wk is kept for downstream plotting; not printed below)
team_trends <- weekly_epa %>%
  group_by(posteam) %>%
  arrange(week) %>%
  mutate(
    cumulative_epa = cumsum(epa_per_play) / row_number(),
    rolling_3wk = zoo::rollmean(epa_per_play, k = 3, fill = NA, align = "right")
  ) %>%
  ungroup()

# Compare early-season (weeks 1-3) with late-season (15+) EPA per team
week_1_3 <- team_trends %>%
  filter(week <= 3) %>%
  group_by(posteam) %>%
  summarize(early_epa = mean(epa_per_play))
week_15_plus <- team_trends %>%
  filter(week >= 15) %>%
  group_by(posteam) %>%
  summarize(late_epa = mean(epa_per_play))

trends <- week_1_3 %>%
  inner_join(week_15_plus, by = "posteam") %>%
  mutate(improvement = late_epa - early_epa) %>%
  arrange(desc(improvement))
print("Most Improved Teams:")
print(trends %>% head(5))
# cat() renders "\n" as a newline; print() would show the escape literally
cat("\nDeclined Most:\n")
print(trends %>% tail(5))
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Team EPA/play by week (pass/run snaps with a known EPA only)
valid = pbp[pbp["epa"].notna() & pbp["play_type"].isin(["pass", "run"])]
weekly_epa = (valid.groupby(["posteam", "week"])
              .agg(plays=("epa", "count"),
                   epa_per_play=("epa", "mean"),
                   success_rate=("success", lambda s: s.mean() * 100))
              .reset_index())

# Early-season (weeks 1-3) vs late-season (15+) EPA per team
early = weekly_epa.loc[weekly_epa["week"] <= 3].groupby("posteam")["epa_per_play"].mean()
late = weekly_epa.loc[weekly_epa["week"] >= 15].groupby("posteam")["epa_per_play"].mean()

trends = pd.DataFrame({"early_epa": early, "late_epa": late})
trends["improvement"] = trends["late_epa"] - trends["early_epa"]
trends = trends.sort_values("improvement", ascending=False).reset_index()
print("Most Improved Teams:")
print(trends.head(5))
print("\nDeclined Most:")
print(trends.tail(5))
nflfastR
tidyverse
zoo
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

pbp <- load_pbp(2023)

# Home vs Away EPA per team on pass/run snaps
home_away <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run")) %>%
  mutate(location = if_else(posteam == home_team, "Home", "Away")) %>%
  group_by(posteam, location) %>%
  summarize(
    plays = n(),
    epa = mean(epa),
    success_rate = mean(success) * 100,
    .groups = "drop"
  ) %>%
  # pivot_wider generates combined names like plays_Home, epa_Home, epa_Away
  pivot_wider(
    names_from = location,
    values_from = c(plays, epa, success_rate)
  ) %>%
  mutate(
    # Positive = the team produced more EPA/play at home than on the road
    home_advantage = epa_Home - epa_Away
  ) %>%
  arrange(desc(home_advantage))
print("Home vs Away Performance:")
print(home_away %>% select(posteam, epa_Home, epa_Away, home_advantage))
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2023])

# Per-team EPA split by game site
plays = pbp[pbp["epa"].notna() & pbp["play_type"].isin(["pass", "run"])].copy()
# Vectorized home/away label: posteam equal to home_team means a home snap
plays["location"] = (plays["posteam"].eq(plays["home_team"])
                     .map({True: "Home", False: "Away"}))

home_away = (plays.groupby(["posteam", "location"])
             .agg(plays=("epa", "count"),
                  epa=("epa", "mean"),
                  success_rate=("success", lambda s: s.mean() * 100))
             .reset_index())

# Wide comparison table; home EPA minus away EPA = home-field edge
by_site = {site: home_away[home_away["location"] == site].set_index("posteam")
           for site in ("Home", "Away")}
comparison = pd.DataFrame({
    "Home EPA": by_site["Home"]["epa"],
    "Away EPA": by_site["Away"]["epa"],
})
comparison["Home Advantage"] = comparison["Home EPA"] - comparison["Away EPA"]
comparison = comparison.sort_values("Home Advantage", ascending=False).reset_index()
print("Home vs Away Performance:")
print(comparison)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

schedules <- load_schedules(2023)

# Stack each completed game twice -- once from the home side, once from the
# away side -- so every team-game becomes one row.
completed <- schedules %>% filter(!is.na(result))
team_games <- bind_rows(
  completed %>% transmute(team = home_team, points_for = home_score,
                          points_against = away_score),
  completed %>% transmute(team = away_team, points_for = away_score,
                          points_against = home_score)
) %>%
  mutate(point_diff = points_for - points_against, win = point_diff > 0)

# Season totals and per-game averages, best point differential first
point_diff <- team_games %>%
  group_by(team) %>%
  summarize(
    games = n(),
    wins = sum(win),
    total_pf = sum(points_for),
    total_pa = sum(points_against),
    total_diff = sum(point_diff),
    ppg = mean(points_for),
    papg = mean(points_against),
    avg_diff = mean(point_diff),
    .groups = "drop"
  ) %>%
  arrange(desc(total_diff))
print("Point Differential Rankings:")
print(point_diff)
import nfl_data_py as nfl
import pandas as pd

schedules = nfl.import_schedules([2023])

# Point differential per team: stack each completed game from both sides
games = schedules[schedules["result"].notna()].copy()
# Home games (points_for = home_score)
home = games[["home_team", "home_score", "away_score"]].copy()
home.columns = ["team", "points_for", "points_against"]
# Away games (points_for = away_score)
away = games[["away_team", "away_score", "home_score"]].copy()
away.columns = ["team", "points_for", "points_against"]
all_games = pd.concat([home, away])
all_games["point_diff"] = all_games["points_for"] - all_games["points_against"]
# NOTE(review): a tie (point_diff == 0) counts as a non-win here
all_games["win"] = all_games["point_diff"] > 0

# Season totals and per-game averages, best differential first
point_diff = (all_games.groupby("team")
              .agg(
                  games=("point_diff", "count"),
                  wins=("win", "sum"),
                  total_pf=("points_for", "sum"),
                  total_pa=("points_against", "sum"),
                  total_diff=("point_diff", "sum"),
                  ppg=("points_for", "mean"),
                  avg_diff=("point_diff", "mean")
              )
              .reset_index()
              .sort_values("total_diff", ascending=False))
print("Point Differential Rankings:")
print(point_diff)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

schedules <- load_schedules(2023)

# One row per team-game: each finished game contributes a home and an away row.
by_team <- schedules %>%
  filter(!is.na(result)) %>%
  pivot_longer(c(home_team, away_team), names_to = "location", values_to = "team") %>%
  mutate(
    pf = if_else(location == "home_team", home_score, away_score),
    pa = if_else(location == "home_team", away_score, home_score),
    win = pf > pa
  )

# Season totals per team.
team_scoring <- by_team %>%
  group_by(team) %>%
  summarize(
    games = n(),
    actual_wins = sum(win),
    points_for = sum(pf),
    points_against = sum(pa),
    .groups = "drop"
  )

# Pythagorean expectation (exponent 2.37 is the commonly used NFL fit).
pyth_k <- 2.37
team_scoring <- team_scoring %>%
  mutate(
    pyth_exp = points_for^pyth_k / (points_for^pyth_k + points_against^pyth_k),
    expected_wins = pyth_exp * games,
    luck = actual_wins - expected_wins  # positive = won more than point diff implies
  ) %>%
  arrange(desc(luck))

print("Pythagorean Win Expectation:")
print(team_scoring %>%
  select(team, actual_wins, expected_wins, luck) %>%
  mutate(across(c(expected_wins, luck), ~round(., 1))))
import nfl_data_py as nfl
import pandas as pd
import numpy as np

schedules = nfl.import_schedules([2023])
games = schedules[schedules["result"].notna()].copy()

# One row per team-game, from each side's perspective.
sides = []
for team_col, pf_col, pa_col in [
    ("home_team", "home_score", "away_score"),
    ("away_team", "away_score", "home_score"),
]:
    side = games[[team_col, pf_col, pa_col]].copy()
    side.columns = ["team", "pf", "pa"]
    sides.append(side)
all_games = pd.concat(sides)
all_games["win"] = all_games["pf"] > all_games["pa"]

# Season totals per team.
team_scoring = (
    all_games.groupby("team")
    .agg(
        games=("win", "count"),
        actual_wins=("win", "sum"),
        points_for=("pf", "sum"),
        points_against=("pa", "sum"),
    )
    .reset_index()
)

# Pythagorean expectation (exponent 2.37 is the commonly used NFL fit).
exp = 2.37
pf_pow = team_scoring["points_for"] ** exp
pa_pow = team_scoring["points_against"] ** exp
team_scoring["pyth_exp"] = pf_pow / (pf_pow + pa_pow)
team_scoring["expected_wins"] = team_scoring["pyth_exp"] * team_scoring["games"]
# Positive "luck" = more actual wins than the point differential implies.
team_scoring["luck"] = team_scoring["actual_wins"] - team_scoring["expected_wins"]
team_scoring = team_scoring.sort_values("luck", ascending=False)

print("Pythagorean Win Expectation:")
print(team_scoring[["team", "actual_wins", "expected_wins", "luck"]].round(1))
nflfastR
tidyverse
nfl_data_py
pandas
numpy
library(nflfastR)
library(tidyverse)

schedules <- load_schedules(2023)
teams <- load_teams()

# Attach each club's division to both sides of every finished game.
games <- schedules %>%
  filter(!is.na(result)) %>%
  left_join(teams %>% select(team_abbr, team_division),
            by = c("home_team" = "team_abbr")) %>%
  rename(home_div = team_division) %>%
  left_join(teams %>% select(team_abbr, team_division),
            by = c("away_team" = "team_abbr")) %>%
  rename(away_div = team_division) %>%
  mutate(division_game = home_div == away_div)

# One row per team-game, from each side's perspective.
home <- games %>%
  select(team = home_team, opponent = away_team, pf = home_score,
         pa = away_score, division_game)
away <- games %>%
  select(team = away_team, opponent = home_team, pf = away_score,
         pa = home_score, division_game)
all_games <- bind_rows(home, away) %>%
  mutate(
    win = pf > pa,
    margin = pf - pa
  )

# Division vs non-division splits per team.
div_perf <- all_games %>%
  group_by(team, division_game) %>%
  summarize(
    games = n(),
    wins = sum(win),
    win_pct = mean(win) * 100,
    avg_margin = mean(margin),
    .groups = "drop"
  ) %>%
  pivot_wider(
    names_from = division_game,
    values_from = c(games, wins, win_pct, avg_margin),
    # FIX: names_prefix = "div_" produces columns like "div_win_pct_TRUE",
    # so the select() below ("win_pct_div_TRUE") would fail. names_glue
    # builds the intended "{value}_div_{TRUE/FALSE}" column names.
    names_glue = "{.value}_div_{division_game}"
  )

print("Division vs Non-Division Performance:")
print(div_perf %>%
  select(team, win_pct_div_TRUE, win_pct_div_FALSE) %>%
  mutate(div_advantage = win_pct_div_TRUE - win_pct_div_FALSE) %>%
  arrange(desc(div_advantage)))
import nfl_data_py as nfl
import pandas as pd

schedules = nfl.import_schedules([2023])
teams = nfl.import_team_desc()

# Attach each club's division to both sides of every finished game.
games = schedules[schedules["result"].notna()].copy()
div_lookup = teams[["team_abbr", "team_division"]]
games = (games
         .merge(div_lookup, left_on="home_team", right_on="team_abbr", how="left")
         .rename(columns={"team_division": "home_div"})
         .drop("team_abbr", axis=1))
games = (games
         .merge(div_lookup, left_on="away_team", right_on="team_abbr", how="left")
         .rename(columns={"team_division": "away_div"})
         .drop("team_abbr", axis=1))
games["division_game"] = games["home_div"] == games["away_div"]

# One row per team-game, from each side's perspective.
home = games[["home_team", "away_team", "home_score", "away_score", "division_game"]].copy()
home.columns = ["team", "opponent", "pf", "pa", "division_game"]
away = games[["away_team", "home_team", "away_score", "home_score", "division_game"]].copy()
away.columns = ["team", "opponent", "pf", "pa", "division_game"]
all_games = pd.concat([home, away])
all_games["win"] = all_games["pf"] > all_games["pa"]
all_games["margin"] = all_games["pf"] - all_games["pa"]

# Per-team splits for division vs non-division games.
div_perf = (all_games.groupby(["team", "division_game"])
    .agg(games=("win", "count"), wins=("win", "sum"),
         win_pct=("win", lambda s: s.mean() * 100), avg_margin=("margin", "mean"))
    .reset_index())

# Wide layout: division and non-division win% side by side per team.
wide = div_perf.pivot(index="team", columns="division_game", values="win_pct")
comparison = pd.DataFrame({"Division": wide[True], "Non-Division": wide[False]})
comparison["Advantage"] = comparison["Division"] - comparison["Non-Division"]
comparison = comparison.sort_values("Advantage", ascending=False).reset_index()

print("Division vs Non-Division Performance:")
print(comparison)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

# Load multiple years for playoff sample
pbp <- load_pbp(2020:2023)

# Separate regular season and playoffs.
# FIX: a week <= 18 cutoff mislabels pre-2021 seasons — the regular season was
# 17 weeks through 2020, so 2020 wild-card games fall in week 18 and would be
# counted as regular season. The season_type column ("REG"/"POST") classifies
# every season correctly.
perf_comparison <- pbp %>%
  filter(play_type %in% c("pass", "run"), !is.na(epa)) %>%
  mutate(game_type = if_else(season_type == "REG", "Regular Season", "Playoffs")) %>%
  group_by(posteam, game_type) %>%
  summarize(
    plays = n(),
    epa_per_play = mean(epa),
    success_rate = mean(success, na.rm = TRUE) * 100,
    pass_rate = mean(play_type == "pass") * 100,
    .groups = "drop"
  ) %>%
  filter(plays >= 100) # Minimum plays threshold

# Teams that actually appear in the playoff sample.
playoff_teams <- perf_comparison %>%
  filter(game_type == "Playoffs") %>%
  pull(posteam) %>%
  unique()

# One row per playoff team: regular-season vs playoff EPA and the difference.
comparison <- perf_comparison %>%
  filter(posteam %in% playoff_teams) %>%
  select(posteam, game_type, epa_per_play) %>%
  pivot_wider(names_from = game_type, values_from = epa_per_play) %>%
  mutate(
    playoff_bump = Playoffs - `Regular Season`
  ) %>%
  arrange(desc(playoff_bump))

print("Playoff vs Regular Season EPA (2020-2023):")
print(comparison)
import nfl_data_py as nfl
import pandas as pd

pbp = nfl.import_pbp_data([2020, 2021, 2022, 2023])

# Filter to offensive snaps with an EPA value.
plays = pbp[(pbp["play_type"].isin(["pass", "run"])) & (pbp["epa"].notna())].copy()
# FIX: the previous week <= 18 cutoff mislabels pre-2021 seasons — the regular
# season was 17 weeks through 2020, so 2020 wild-card games fall in week 18.
# The season_type column ("REG"/"POST") classifies every season correctly.
plays["game_type"] = plays["season_type"].map(
    {"REG": "Regular Season", "POST": "Playoffs"}
)

# Performance by game type
perf = (plays.groupby(["posteam", "game_type"])
    .agg(
        plays=("epa", "count"),
        epa_per_play=("epa", "mean"),
        success_rate=("success", lambda x: x.mean() * 100),
        pass_rate=("play_type", lambda x: (x == "pass").mean() * 100)
    )
    .reset_index())
perf = perf[perf["plays"] >= 100]  # minimum plays threshold

# Teams with playoff experience
playoff_teams = perf[perf["game_type"] == "Playoffs"]["posteam"].unique()
comparison = perf[perf["posteam"].isin(playoff_teams)].copy()

# Pivot to one row per team
reg = comparison[comparison["game_type"] == "Regular Season"].set_index("posteam")["epa_per_play"]
playoff = comparison[comparison["game_type"] == "Playoffs"].set_index("posteam")["epa_per_play"]
result = pd.DataFrame({"Regular Season": reg, "Playoffs": playoff})
result["Playoff Bump"] = result["Playoffs"] - result["Regular Season"]
result = result.sort_values("Playoff Bump", ascending=False).reset_index()

print("Playoff vs Regular Season EPA (2020-2023):")
print(result)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

schedules <- load_schedules(2023)

# Completed games only (result is NA for unplayed games).
games <- schedules %>%
  filter(!is.na(result))

# One row per team-game, from each side's perspective.
home <- games %>%
  select(team = home_team, pf = home_score, pa = away_score) %>%
  mutate(win = pf > pa)
away <- games %>%
  select(team = away_team, pf = away_score, pa = home_score) %>%
  mutate(win = pf > pa)

records <- bind_rows(home, away) %>%
  group_by(team) %>%
  summarize(
    wins = sum(win),
    losses = sum(!win),
    win_pct = mean(win),
    .groups = "drop"
  )

# SOS for one team: mean win percentage of every opponent it faced.
calculate_sos <- function(team_name) {
  opponents <- c(
    games$away_team[games$home_team == team_name],
    games$home_team[games$away_team == team_name]
  )
  opp_records <- records %>%
    filter(team %in% opponents)
  mean(opp_records$win_pct)
}

# FIX: removed the dead `sos_rank = NA` placeholder that was assigned inside
# rowwise() and then immediately overwritten after ungroup().
sos <- records %>%
  rowwise() %>%
  mutate(sos = calculate_sos(team)) %>%
  ungroup() %>%
  mutate(sos_rank = rank(-sos))

print("Strength of Schedule Rankings:")
print(sos %>%
  select(team, wins, losses, win_pct, sos) %>%
  arrange(desc(sos)) %>%
  mutate(
    sos = round(sos, 3),
    rank = row_number()
  ))
import nfl_data_py as nfl
import pandas as pd

schedules = nfl.import_schedules([2023])
games = schedules[schedules["result"].notna()].copy()

# One row per team-game, from each side's perspective.
sides = []
for team_col, pf_col, pa_col in [
    ("home_team", "home_score", "away_score"),
    ("away_team", "away_score", "home_score"),
]:
    side = games[[team_col, pf_col, pa_col]].copy()
    side.columns = ["team", "pf", "pa"]
    side["win"] = side["pf"] > side["pa"]
    sides.append(side)
all_games = pd.concat(sides)

# Season record per team (a tie fails the pf > pa test, so it lands in losses).
records = (
    all_games.groupby("team")
    .agg(
        wins=("win", "sum"),
        losses=("win", lambda s: (~s).sum()),
        win_pct=("win", "mean"),
    )
    .reset_index()
)

# SOS for one team: mean win percentage of every opponent it faced.
def calc_sos(team):
    faced = pd.concat([
        games.loc[games["home_team"] == team, "away_team"],
        games.loc[games["away_team"] == team, "home_team"],
    ])
    return records.loc[records["team"].isin(faced), "win_pct"].mean()

records["sos"] = records["team"].apply(calc_sos)
records = records.sort_values("sos", ascending=False)
records["rank"] = range(1, len(records) + 1)

print("Strength of Schedule Rankings:")
print(records[["rank", "team", "wins", "losses", "sos"]].round(3))
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)

# Load two seasons
pbp_prev <- load_pbp(2022)
pbp_curr <- load_pbp(2023)

# Per-team offensive efficiency (EPA, success rate, pass rate) for one season.
calc_team_epa <- function(pbp_data, season_year) {
  pbp_data %>%
    filter(play_type %in% c("pass", "run"), !is.na(epa)) %>%
    group_by(team = posteam) %>%
    summarize(
      plays = n(),
      epa_per_play = mean(epa),
      success_rate = mean(success, na.rm = TRUE) * 100,
      pass_rate = mean(play_type == "pass") * 100,
      .groups = "drop"
    ) %>%
    mutate(season = season_year)
}

epa_2022 <- calc_team_epa(pbp_prev, 2022)
epa_2023 <- calc_team_epa(pbp_curr, 2023)

# Join the two seasons and measure the change, biggest improvement first.
improvement <- epa_2022 %>%
  select(team, epa_2022 = epa_per_play, sr_2022 = success_rate) %>%
  inner_join(
    epa_2023 %>% select(team, epa_2023 = epa_per_play, sr_2023 = success_rate),
    by = "team"
  ) %>%
  mutate(
    epa_change = epa_2023 - epa_2022,
    sr_change = sr_2023 - sr_2022
  ) %>%
  arrange(desc(epa_change))

print("Year-over-Year EPA Change (2022 to 2023):")
print(improvement %>%
  mutate(across(where(is.numeric), ~round(., 3))))
import nfl_data_py as nfl
import pandas as pd

# Load two seasons
pbp_2022 = nfl.import_pbp_data([2022])
pbp_2023 = nfl.import_pbp_data([2023])

def calc_team_epa(pbp, year):
    """Per-team offensive efficiency (EPA, success rate, pass rate) for one season."""
    mask = pbp["play_type"].isin(["pass", "run"]) & pbp["epa"].notna()
    grouped = pbp[mask].groupby("posteam").agg(
        plays=("epa", "count"),
        epa_per_play=("epa", "mean"),
        success_rate=("success", lambda s: s.mean() * 100),
        pass_rate=("play_type", lambda s: (s == "pass").mean() * 100),
    )
    grouped = grouped.reset_index().rename(columns={"posteam": "team"})
    grouped["season"] = year
    return grouped

epa_2022 = calc_team_epa(pbp_2022, 2022)
epa_2023 = calc_team_epa(pbp_2023, 2023)

# Merge the two seasons and measure the change, biggest improvement first.
comparison = pd.merge(
    epa_2022[["team", "epa_per_play", "success_rate"]],
    epa_2023[["team", "epa_per_play", "success_rate"]],
    on="team",
    suffixes=("_2022", "_2023"),
)
comparison["epa_change"] = comparison["epa_per_play_2023"] - comparison["epa_per_play_2022"]
comparison["sr_change"] = comparison["success_rate_2023"] - comparison["success_rate_2022"]
comparison = comparison.sort_values("epa_change", ascending=False)

print("Year-over-Year EPA Change (2022 to 2023):")
print(comparison.round(3))
nflfastR
tidyverse
nfl_data_py
pandas
nflfastR - Play-by-play data with EPA
nflplotR - NFL team logos & plotting
tidyverse - Data manipulation & visualization
ggplot2 - Advanced visualizations
nfl_data_py - NFL data (nflverse compatible)
pandas - Data manipulation
matplotlib - Visualizations
scikit-learn - Machine learning
Learn the theory behind these techniques in our comprehensive tutorial series.
Browse Tutorials