Copy-paste ready R and Python code for NFL analytics. From data loading to machine learning models.
Build predictive models for spreads, totals, and player props
library(nflfastR)
library(tidyverse)
# ---- Data ----
# 2023 play-by-play and schedule tables.
pbp <- load_pbp(2023)
schedules <- load_schedules(2023)

# ---- Team ratings ----
# Only plays with a non-missing EPA contribute to either side of the rating.
epa_plays <- pbp %>%
  filter(!is.na(epa))

# Mean EPA per play when the team has the ball.
offense_epa <- epa_plays %>%
  group_by(posteam) %>%
  summarize(off_epa = mean(epa))

# Mean EPA per play allowed when the team is on defense.
defense_epa <- epa_plays %>%
  group_by(defteam) %>%
  summarize(def_epa = mean(epa))

team_ratings <- offense_epa %>%
  left_join(defense_epa, by = c("posteam" = "defteam")) %>%
  mutate(
    net_epa = off_epa - def_epa,
    power_rating = net_epa * 3.5 # Convert to points
  )

# ---- Matchup predictions ----
# The home side receives every rating column; the away side only needs the
# power rating, so it is trimmed before joining.
away_lookup <- team_ratings %>%
  select(posteam, power_rating)

games <- schedules %>%
  filter(!is.na(result)) %>%
  left_join(team_ratings, by = c("home_team" = "posteam")) %>%
  rename(home_power = power_rating) %>%
  left_join(away_lookup, by = c("away_team" = "posteam")) %>%
  rename(away_power = power_rating) %>%
  mutate(
    # Spread from the home team's point of view: negative = home favored.
    pred_spread = away_power - home_power - 2.5, # HFA
    actual_spread = -result,
    error = pred_spread - actual_spread,
    # TRUE when the predicted winner matches the team that actually won.
    correct_side = (pred_spread > 0 & result < 0) |
      (pred_spread < 0 & result > 0)
  )

# ---- Model performance ----
mean_abs_error <- mean(abs(games$error))
correct_side_rate <- mean(games$correct_side)
cat("Mean Absolute Error:", mean_abs_error, "\n")
cat("Correct Side %:", correct_side_rate * 100, "%\n")
import nfl_data_py as nfl
import pandas as pd
import numpy as np
# Load 2023 play-by-play and schedule data
pbp = nfl.import_pbp_data([2023])
schedules = nfl.import_schedules([2023])

# Team ratings: mean EPA per play on offense and allowed on defense,
# using only plays that carry an EPA value.
plays = pbp[pbp["epa"].notna()]

off_epa = (
    plays.groupby("posteam")["epa"]
    .mean()
    .rename("off_epa")
    .rename_axis("team")
    .reset_index()
)
def_epa = (
    plays.groupby("defteam")["epa"]
    .mean()
    .rename("def_epa")
    .rename_axis("team")
    .reset_index()
)

team_ratings = off_epa.merge(def_epa, on="team").assign(
    net_epa=lambda df: df["off_epa"] - df["def_epa"],
    power_rating=lambda df: df["net_epa"] * 3.5,
)

# Matchup predictions: attach each side's power rating to completed games.
games = schedules[schedules["result"].notna()].copy()
power_lookup = team_ratings[["team", "power_rating"]]
for side in ("home", "away"):
    games = games.merge(
        power_lookup, left_on=f"{side}_team", right_on="team"
    ).rename(columns={"power_rating": f"{side}_power"})

# Spread from the home team's point of view (negative = home favored);
# 2.5 is the home-field adjustment.
games["pred_spread"] = games["away_power"] - games["home_power"] - 2.5
games["actual_spread"] = -games["result"]
games["error"] = games["pred_spread"] - games["actual_spread"]

picked_away_and_won = (games["pred_spread"] > 0) & (games["result"] < 0)
picked_home_and_won = (games["pred_spread"] < 0) & (games["result"] > 0)
games["correct_side"] = picked_away_and_won | picked_home_and_won

# Model performance
mean_abs_error = games["error"].abs().mean()
correct_side_rate = games["correct_side"].mean()
print(f"Mean Absolute Error: {mean_abs_error:.2f}")
print(f"Correct Side %: {correct_side_rate * 100:.1f}%")
nflfastR
tidyverse
nfl_data_py
pandas
numpy
library(nflfastR)
library(tidyverse)
# Totals (over/under) model: project each game's combined score from team
# scoring averages and pace, then grade the projection against the betting
# total.
pbp <- load_pbp(2023)
schedules <- load_schedules(2023)

# Calculate team scoring metrics
# Per game and possession team: play count plus an approximate point total
# (7 per touchdown, 3 per made field goal), then averaged per team.
team_scoring <- pbp %>%
  filter(!is.na(epa)) %>%
  group_by(game_id, posteam) %>%
  summarize(
    plays = n(),
    # The comparison must be parenthesized: `*` binds tighter than `==`, so
    # `field_goal_result == "made" * 3` tries to multiply a string and errors.
    points_scored = sum(touchdown * 7, na.rm = TRUE) +
      sum((field_goal_result == "made") * 3, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  group_by(posteam) %>%
  summarize(
    avg_plays = mean(plays),
    avg_points = mean(points_scored),
    pace = avg_plays / 60 # plays per minute proxy
  )

# Create totals predictions
# In the nflverse schedule table `total` is the actual combined score, while
# `total_line` is the betting total; over/under grading needs `total_line`.
games <- schedules %>%
  filter(!is.na(result), !is.na(total_line)) %>%
  left_join(team_scoring, by = c("home_team" = "posteam")) %>%
  rename(home_points = avg_points, home_pace = pace) %>%
  left_join(
    team_scoring %>% select(posteam, avg_points, pace),
    by = c("away_team" = "posteam")
  ) %>%
  rename(away_points = avg_points, away_pace = pace) %>%
  mutate(
    # Combined scoring average scaled by the two teams' mean pace factor.
    pred_total = (home_points + away_points) *
      ((home_pace + away_pace) / 2),
    actual_total = home_score + away_score,
    over_under = if_else(actual_total > total_line, "Over", "Under"),
    pred_over = pred_total > total_line,
    actual_over = actual_total > total_line,
    correct = pred_over == actual_over
  )

# Model performance
cat("Correct %:", mean(games$correct) * 100, "%\n")
cat("MAE:", mean(abs(games$pred_total - games$actual_total)), "\n")
import nfl_data_py as nfl
import pandas as pd
import numpy as np
# Totals (over/under) model: project each game's combined score from team
# scoring averages and pace, then grade the projection against the betting
# total.
pbp = nfl.import_pbp_data([2023])
schedules = nfl.import_schedules([2023])

# Calculate team scoring metrics
plays = pbp[pbp["epa"].notna()].copy()
# Made field goals are flagged so they can be summed per game alongside
# touchdowns (the touchdown-only version ignored field goals entirely).
plays["fg_made"] = (plays["field_goal_result"] == "made").astype(int)

game_scoring = (
    plays.groupby(["game_id", "posteam"])
    .agg(
        plays=("epa", "count"),
        touchdowns=("touchdown", "sum"),
        field_goals=("fg_made", "sum"),
    )
    .reset_index()
)
# Approximate points: 7 per touchdown, 3 per made field goal.
game_scoring["points_approx"] = (
    game_scoring["touchdowns"] * 7 + game_scoring["field_goals"] * 3
)

team_scoring = (
    game_scoring.groupby("posteam")
    .agg(
        avg_plays=("plays", "mean"),
        avg_points=("points_approx", "mean"),
    )
    .reset_index()
)
team_scoring["pace"] = team_scoring["avg_plays"] / 60  # plays per minute proxy

# Create totals predictions
# In the nflverse schedule table `total` is the actual combined score, while
# `total_line` is the betting total; over/under grading needs `total_line`.
games = schedules[
    schedules["result"].notna() & schedules["total_line"].notna()
].copy()
games = games.merge(
    team_scoring[["posteam", "avg_points", "pace"]],
    left_on="home_team", right_on="posteam"
).rename(columns={"avg_points": "home_points", "pace": "home_pace"})
games = games.merge(
    team_scoring[["posteam", "avg_points", "pace"]],
    left_on="away_team", right_on="posteam"
).rename(columns={"avg_points": "away_points", "pace": "away_pace"})

# Combined scoring average scaled by the two teams' mean pace factor.
games["pred_total"] = (games["home_points"] + games["away_points"]) * (
    (games["home_pace"] + games["away_pace"]) / 2
)
games["actual_total"] = games["home_score"] + games["away_score"]
games["correct"] = (games["pred_total"] > games["total_line"]) == (
    games["actual_total"] > games["total_line"]
)

print(f"Correct %: {games['correct'].mean() * 100:.1f}%")
print(f"MAE: {(games['pred_total'] - games['actual_total']).abs().mean():.1f}")
nflfastR
tidyverse
nfl_data_py
pandas
numpy
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)

# ---- Receiver baselines ----
# One row per (receiver, team): volume and efficiency on pass plays that name
# a receiver, restricted to receivers with at least 50 targets.
receiver_stats <- pbp %>%
  filter(!is.na(receiver_player_id), play_type == "pass") %>%
  group_by(receiver_player_id, receiver_player_name, posteam) %>%
  summarize(
    targets = n(),
    receptions = sum(complete_pass),
    yards = sum(yards_gained, na.rm = TRUE),
    tds = sum(touchdown),
    avg_depth = mean(air_yards, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(targets >= 50) %>%
  mutate(
    catch_rate = receptions / targets,
    yards_per_target = yards / targets,
    yards_per_reception = yards / receptions
  )

# ---- Team target totals ----
# Denominator for target share: every targeted pass play by the team.
team_targets <- pbp %>%
  filter(play_type == "pass", !is.na(receiver_player_id)) %>%
  group_by(posteam) %>%
  summarize(team_targets = n())

# ---- Next-game projection ----
# Assume team throws ~35 passes per game; a receiver's projected targets are
# his share of those, and projected yards follow from yards per target.
receiver_share <- receiver_stats %>%
  left_join(team_targets, by = "posteam") %>%
  mutate(
    target_share = targets / team_targets,
    proj_targets = target_share * 35,
    proj_yards = proj_targets * yards_per_target
  ) %>%
  arrange(desc(proj_yards))

top_projections <- receiver_share %>%
  select(receiver_player_name, target_share, yards_per_target, proj_yards) %>%
  head(20)
print(top_projections)
import nfl_data_py as nfl
import pandas as pd
pbp = nfl.import_pbp_data([2023])

# Receiver baselines: pass plays that name a receiver count as targets.
is_target = pbp["receiver_player_id"].notna() & (pbp["play_type"] == "pass")
pass_plays = pbp[is_target]

receiver_keys = ["receiver_player_id", "receiver_player_name", "posteam"]
receiver_stats = pass_plays.groupby(receiver_keys).agg(
    targets=("play_id", "count"),
    receptions=("complete_pass", "sum"),
    yards=("yards_gained", "sum"),
    tds=("touchdown", "sum"),
    avg_depth=("air_yards", "mean"),
).reset_index()

# Keep receivers with at least 50 targets and derive efficiency rates.
receiver_stats = receiver_stats[receiver_stats["targets"] >= 50].assign(
    catch_rate=lambda df: df["receptions"] / df["targets"],
    yards_per_target=lambda df: df["yards"] / df["targets"],
)

# Team target totals are the denominator for each receiver's target share.
targets_by_team = pass_plays.groupby("posteam").size()
team_targets = targets_by_team.reset_index(name="team_targets")

receiver_share = receiver_stats.merge(team_targets, on="posteam")
receiver_share["target_share"] = (
    receiver_share["targets"] / receiver_share["team_targets"]
)

# Project yards (35 passes per game assumption)
receiver_share["proj_targets"] = receiver_share["target_share"] * 35
receiver_share["proj_yards"] = (
    receiver_share["proj_targets"] * receiver_share["yards_per_target"]
)

report_columns = [
    "receiver_player_name", "target_share", "yards_per_target", "proj_yards"
]
result = receiver_share.nlargest(20, "proj_yards")[report_columns]
print(result)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
schedules <- load_schedules(2020:2023)

# Initialize Elo ratings
# Every team that appears as a home or away side starts at 1500. The vector
# length follows the number of distinct teams found in the data instead of a
# hard-coded 32, so names and ratings always line up.
teams <- unique(c(schedules$home_team, schedules$away_team))
elo <- setNames(rep(1500, length(teams)), teams)

# Base update size applied to each game's rating change.
k_factor <- 20
# Expected score of a side rated `r1` against an opponent rated `r2`, using
# the logistic Elo curve: equal ratings give 0.5, and a 400-point edge gives
# 10/11.
expected <- function(r1, r2) {
  rating_gap <- r2 - r1
  1 / (1 + 10^(rating_gap / 400))
}
# Rating points to move from the loser to the winner of one game.
#
# winner_rating, loser_rating: pre-game ratings of the two sides.
# margin: final point difference; its absolute value is used.
#
# The change is k_factor, scaled by a margin-of-victory multiplier and by how
# unexpected the win was (1 minus the winner's expected score). The
# multiplier grows with log(margin + 1) and shrinks as the winner's rating
# edge grows.
update_elo <- function(winner_rating, loser_rating, margin) {
  rating_gap <- winner_rating - loser_rating
  mov_mult <- log(abs(margin) + 1) * 2.2 / (rating_gap * 0.001 + 2.2)
  surprise <- 1 - expected(winner_rating, loser_rating)
  k_factor * mov_mult * surprise
}
# ---- Process games ----
# Completed games, ordered by game_id so ratings are updated in sequence.
games <- schedules %>%
  filter(!is.na(result)) %>%
  arrange(game_id)

elo_history <- list()

for (i in seq_len(nrow(games))) {
  g <- games[i, ]
  home <- g$home_team
  away <- g$away_team

  # The home side gets a 55-point bump (HFA) for the expectation only; the
  # stored ratings are updated without it.
  home_elo <- elo[home] + 55
  away_elo <- elo[away]

  if (g$result > 0) {
    # Home win
    winner <- home
    loser <- away
    change <- update_elo(home_elo, away_elo, g$result)
  } else {
    # Away win (a zero margin also lands here and yields a zero change)
    winner <- away
    loser <- home
    change <- update_elo(away_elo, home_elo, -g$result)
  }

  elo[winner] <- elo[winner] + change
  elo[loser] <- elo[loser] - change
}

# ---- Current Elo rankings ----
elo_df <- data.frame(team = names(elo), elo = elo) %>%
  arrange(desc(elo))
print(elo_df)
import nfl_data_py as nfl
import pandas as pd
import numpy as np
schedules = nfl.import_schedules([2020, 2021, 2022, 2023])

# Initialize Elo ratings: every team seen as a home or away side starts at 1500.
all_sides = pd.concat([schedules["home_team"], schedules["away_team"]])
teams = all_sides.unique()
elo = dict.fromkeys(teams, 1500)

# Base update size applied to each game's rating change.
k_factor = 20
def expected(r1, r2):
    """Return the expected score of a side rated r1 against one rated r2.

    Uses the logistic Elo curve: equal ratings give 0.5, and a 400-point
    edge gives 10/11.
    """
    exponent = (r2 - r1) / 400
    return 1 / (1 + 10 ** exponent)
def update_elo(winner_rating, loser_rating, margin):
    """Return the rating points to move from the loser to the winner.

    The change is k_factor scaled by a margin-of-victory multiplier and by
    how unexpected the win was (1 minus the winner's expected score). The
    multiplier grows with log(|margin| + 1) and shrinks as the winner's
    rating edge grows.
    """
    rating_gap = winner_rating - loser_rating
    mov_mult = np.log(abs(margin) + 1) * 2.2 / (rating_gap * 0.001 + 2.2)
    surprise = 1 - expected(winner_rating, loser_rating)
    return k_factor * mov_mult * surprise
# Process completed games in game_id order so ratings update in sequence.
games = schedules[schedules["result"].notna()].sort_values("game_id")

for _, g in games.iterrows():
    home, away = g["home_team"], g["away_team"]
    margin = g["result"]

    # The home side gets a 55-point bump for the expectation only; the
    # stored ratings are updated without it.
    home_elo = elo[home] + 55  # Home field advantage
    away_elo = elo[away]

    if margin > 0:  # Home win
        winner, loser = home, away
        change = update_elo(home_elo, away_elo, margin)
    else:  # Away win (a zero margin also lands here and yields zero change)
        winner, loser = away, home
        change = update_elo(away_elo, home_elo, -margin)

    elo[winner] += change
    elo[loser] -= change

# Current Elo rankings, best first
elo_df = pd.DataFrame(list(elo.items()), columns=["team", "elo"])
elo_df = elo_df.sort_values("elo", ascending=False).reset_index(drop=True)
print("Current Elo Rankings:")
print(elo_df)
nflfastR
tidyverse
nfl_data_py
pandas
numpy
library(nflfastR)
library(tidyverse)
pbp <- load_pbp(2023)
schedules <- load_schedules(2023)

# ---- Efficiency components ----
# Run and pass plays with a non-missing EPA are the basis for both sides.
scrimmage_plays <- pbp %>%
  filter(!is.na(epa), play_type %in% c("pass", "run"))

offense <- scrimmage_plays %>%
  group_by(posteam) %>%
  summarize(
    off_epa = mean(epa),
    success_rate = mean(success) * 100,
    .groups = "drop"
  )

defense <- scrimmage_plays %>%
  group_by(defteam) %>%
  summarize(
    def_epa = mean(epa),
    def_success_allowed = mean(success) * 100
  )

team_metrics <- offense %>%
  left_join(defense, by = c("posteam" = "defteam"))

# ---- Win-loss record ----
# Stack one row per team per completed game: the home side wins on a positive
# result, the away side on a negative one (a tie is a win for neither).
played <- schedules %>%
  filter(!is.na(result))

records <- bind_rows(
  played %>% transmute(team = home_team, win = result > 0),
  played %>% transmute(team = away_team, win = result < 0)
) %>%
  group_by(team) %>%
  summarize(wins = sum(win), games = n(), win_pct = wins / games)

# ---- Composite power ranking ----
# Min-max rescale a metric onto 0-100 across teams.
scale_0_100 <- function(x) {
  (x - min(x)) / (max(x) - min(x)) * 100
}

power_rankings <- team_metrics %>%
  left_join(records, by = c("posteam" = "team")) %>%
  mutate(
    off_score = scale_0_100(off_epa),
    # Lower EPA allowed is better, so the defensive scale is flipped.
    def_score = 100 - scale_0_100(def_epa),
    win_score = win_pct * 100,
    # Composite (40% offense, 40% defense, 20% record)
    power_rating = off_score * 0.4 + def_score * 0.4 + win_score * 0.2
  ) %>%
  arrange(desc(power_rating)) %>%
  mutate(rank = row_number())

ranking_table <- power_rankings %>%
  select(posteam, rank, power_rating, off_epa, def_epa, win_pct)
print(ranking_table)
import nfl_data_py as nfl
import pandas as pd
import numpy as np
# Composite power rankings: 40% offensive EPA, 40% defensive EPA,
# 20% win percentage, each expressed on a 0-100 scale.
pbp = nfl.import_pbp_data([2023])
schedules = nfl.import_schedules([2023])

# Calculate team metrics from run and pass plays that carry an EPA value
plays = pbp[(pbp["epa"].notna()) & (pbp["play_type"].isin(["pass", "run"]))]
off_stats = plays.groupby("posteam").agg(
    off_epa=("epa", "mean"),
    success_rate=("success", lambda x: x.mean() * 100)
).reset_index()
def_stats = plays.groupby("defteam").agg(
    def_epa=("epa", "mean")
).reset_index()
team_metrics = off_stats.merge(def_stats, left_on="posteam", right_on="defteam")

# Get win-loss records
games = schedules[schedules["result"].notna()]
home_wins = games[games["result"] > 0].groupby("home_team").size()
away_wins = games[games["result"] < 0].groupby("away_team").size()
# fill_value=0 keeps a team that appears on only one side of the schedule;
# plain `+` would turn its game count into NaN through index misalignment.
total_games = games.groupby("home_team").size().add(
    games.groupby("away_team").size(), fill_value=0
)
# A team with no wins is absent from both win counts; reindexing against the
# teams that played gives it 0 wins instead of a NaN win percentage.
total_wins = home_wins.add(away_wins, fill_value=0).reindex(
    total_games.index, fill_value=0
)
win_pct = (total_wins / total_games).reset_index()
win_pct.columns = ["team", "win_pct"]

# Combine
power_rankings = team_metrics.merge(win_pct, left_on="posteam", right_on="team")


# Normalize
def normalize(col):
    """Min-max rescale a Series onto 0-100."""
    return (col - col.min()) / (col.max() - col.min()) * 100


power_rankings["off_score"] = normalize(power_rankings["off_epa"])
# Lower EPA allowed is better, so the defensive scale is flipped.
power_rankings["def_score"] = 100 - normalize(power_rankings["def_epa"])
power_rankings["win_score"] = power_rankings["win_pct"] * 100
power_rankings["power_rating"] = (power_rankings["off_score"] * 0.4 +
                                  power_rankings["def_score"] * 0.4 +
                                  power_rankings["win_score"] * 0.2)
power_rankings = power_rankings.sort_values(
    "power_rating", ascending=False
).reset_index(drop=True)
power_rankings["rank"] = range(1, len(power_rankings) + 1)
print(power_rankings[["posteam", "rank", "power_rating", "off_epa", "def_epa", "win_pct"]])
nflfastR
tidyverse
nfl_data_py
pandas
numpy
library(nflfastR)
library(tidyverse)
# Home-field advantage study over the 2019-2023 schedules. `result` is used
# as the home margin: positive = home win, negative = away win, zero = tie.
schedules <- load_schedules(2019:2023)

# Overall HFA
hfa_overall <- schedules %>%
  filter(!is.na(result)) %>%
  summarize(
    games = n(),
    home_wins = sum(result > 0),
    ties = sum(result == 0),
    away_wins = sum(result < 0),
    home_win_pct = mean(result > 0) * 100,
    avg_home_margin = mean(result)
  )
# Headers are written with cat(): print() on a string shows the quotes and
# renders "\n" as a literal backslash-n instead of a blank line.
cat("Overall Home Field Advantage:\n")
print(hfa_overall)

# HFA by team
team_hfa <- schedules %>%
  filter(!is.na(result)) %>%
  group_by(home_team) %>%
  summarize(
    home_games = n(),
    home_wins = sum(result > 0),
    home_win_pct = mean(result > 0) * 100,
    avg_margin = mean(result),
    .groups = "drop"
  ) %>%
  arrange(desc(home_win_pct))
cat("\nHFA by Stadium:\n")
print(team_hfa)

# HFA by season (trend over time)
hfa_by_season <- schedules %>%
  filter(!is.na(result)) %>%
  group_by(season) %>%
  summarize(
    home_win_pct = mean(result > 0) * 100,
    avg_margin = mean(result)
  )
cat("\nHFA Trend by Season:\n")
print(hfa_by_season)
import nfl_data_py as nfl
import pandas as pd
schedules = nfl.import_schedules([2019, 2020, 2021, 2022, 2023])


def home_win_rate(margins):
    """Percentage of games in `margins` with a positive (home-win) result."""
    return (margins > 0).mean() * 100


def home_win_count(margins):
    """Number of games in `margins` with a positive (home-win) result."""
    return (margins > 0).sum()


# Overall HFA across every completed game
games = schedules[schedules["result"].notna()]
home_margin = games["result"]
hfa_overall = {
    "games": len(games),
    "home_wins": home_win_count(home_margin),
    "away_wins": (home_margin < 0).sum(),
    "home_win_pct": home_win_rate(home_margin),
    "avg_home_margin": home_margin.mean(),
}
print("Overall Home Field Advantage:")
print(pd.DataFrame([hfa_overall]))

# HFA by team, best home win percentage first
by_home_team = games.groupby("home_team").agg(
    home_games=("result", "count"),
    home_wins=("result", home_win_count),
    home_win_pct=("result", home_win_rate),
    avg_margin=("result", "mean"),
)
team_hfa = by_home_team.reset_index().sort_values(
    "home_win_pct", ascending=False
)
print("\nHFA by Stadium:")
print(team_hfa)

# HFA by season
by_season = games.groupby("season").agg(
    home_win_pct=("result", home_win_rate),
    avg_margin=("result", "mean"),
)
hfa_by_season = by_season.reset_index()
print("\nHFA Trend by Season:")
print(hfa_by_season)
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
# Key-number study: how often final margins land on particular values.
schedules <- load_schedules(2018:2023)

# Calculate margin distribution
# One row per absolute final margin with its game count and share of games.
margins <- schedules %>%
  filter(!is.na(result)) %>%
  mutate(margin = abs(result)) %>%
  group_by(margin) %>%
  summarize(games = n()) %>%
  mutate(pct = games / sum(games) * 100) %>%
  arrange(desc(games))

# Top key numbers
key_numbers <- margins %>%
  head(15) %>%
  mutate(cumulative_pct = cumsum(pct))
# Headers are written with cat(): print() on a string shows the quotes and
# renders "\n" as a literal backslash-n instead of a blank line.
cat("Top 15 Final Margins:\n")
print(key_numbers)

# Games landing on 3 and 7
# The per-game flags use names distinct from the summary columns.
# summarize() evaluates its arguments in order, so once `on_3 = sum(on_3)`
# has run, a later `mean(on_3)` would see the count rather than the flags.
on_3_or_7 <- schedules %>%
  filter(!is.na(result)) %>%
  mutate(
    is_3 = abs(result) == 3,
    is_7 = abs(result) == 7,
    is_key = abs(result) %in% c(3, 7, 10, 14)
  ) %>%
  summarize(
    total_games = n(),
    on_3 = sum(is_3),
    on_7 = sum(is_7),
    on_3_pct = mean(is_3) * 100,
    on_7_pct = mean(is_7) * 100,
    any_key_pct = mean(is_key) * 100
  )
cat("\nKey Number Summary:\n")
print(on_3_or_7)
import nfl_data_py as nfl
import pandas as pd
schedules = nfl.import_schedules([2018, 2019, 2020, 2021, 2022, 2023])

# Margin distribution: game count per absolute final margin, most common first
games = schedules[schedules["result"].notna()].copy()
games["margin"] = games["result"].abs()

games_per_margin = games.groupby("margin").size().reset_index(name="games")
margins = games_per_margin.sort_values("games", ascending=False)
total_games = margins["games"].sum()
margins["pct"] = margins["games"] / total_games * 100
margins["cumulative_pct"] = margins["pct"].cumsum()
print("Top 15 Final Margins:")
print(margins.head(15))

# Key number analysis: flag games landing on 3, on 7, or on any of 3/7/10/14
games["on_3"] = games["margin"] == 3
games["on_7"] = games["margin"] == 7
games["on_key"] = games["margin"].isin([3, 7, 10, 14])

key_summary = {"total_games": len(games)}
for flag in ("on_3", "on_7"):
    key_summary[flag] = games[flag].sum()
for flag in ("on_3", "on_7"):
    key_summary[f"{flag}_pct"] = games[flag].mean() * 100
key_summary["any_key_pct"] = games["on_key"].mean() * 100

print("\nKey Number Summary:")
print(pd.DataFrame([key_summary]))
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
# Against-the-spread (ATS) study for the 2023 season.
schedules <- load_schedules(2023)

# Analyze spread movement patterns
# nflverse convention: `result` is home score minus away score, and
# `spread_line` is on the same scale, so a POSITIVE spread_line means the
# home team was favored by that many points. The home team therefore covers
# when result - spread_line > 0.
spread_analysis <- schedules %>%
  filter(!is.na(result), !is.na(spread_line)) %>%
  mutate(
    # Home margin relative to the line; 0 is a push.
    ats_margin = result - spread_line,
    home_covered = ats_margin > 0,
    # A pick'em (spread_line == 0) has no favorite and is labelled "Away".
    favorite = if_else(spread_line > 0, "Home", "Away"),
    spread_bucket = case_when(
      abs(spread_line) <= 3 ~ "Pick/FG",
      abs(spread_line) <= 7 ~ "4-7 pts",
      abs(spread_line) <= 10 ~ "8-10 pts",
      TRUE ~ "10+ pts"
    )
  )

# ATS performance by spread size
ats_by_spread <- spread_analysis %>%
  group_by(spread_bucket) %>%
  summarize(
    games = n(),
    home_cover_pct = mean(home_covered) * 100,
    avg_margin = mean(ats_margin),
    .groups = "drop"
  )
# Headers are written with cat(): print() on a string shows the quotes and
# renders "\n" as a literal backslash-n instead of a blank line.
cat("ATS Performance by Spread Size:\n")
print(ats_by_spread)

# Favorite vs underdog ATS
# Each side is graded on its own so a push counts as a cover for neither;
# the two percentages sum to less than 100 when pushes occur.
fav_vs_dog <- spread_analysis %>%
  mutate(
    favorite_covered = if_else(favorite == "Home", ats_margin > 0, ats_margin < 0),
    dog_covered = if_else(favorite == "Home", ats_margin < 0, ats_margin > 0)
  ) %>%
  summarize(
    games = n(),
    favorite_cover_pct = mean(favorite_covered) * 100,
    dog_cover_pct = mean(dog_covered) * 100
  )
cat("\nFavorite vs Underdog ATS:\n")
print(fav_vs_dog)
import nfl_data_py as nfl
import pandas as pd
import numpy as np
# Against-the-spread (ATS) study for the 2023 season.
schedules = nfl.import_schedules([2023])

# Analyze spread patterns
# nflverse convention: `result` is home score minus away score, and
# `spread_line` is on the same scale, so a POSITIVE spread_line means the
# home team was favored by that many points. The home team therefore covers
# when result - spread_line > 0.
games = schedules[
    (schedules["result"].notna()) & (schedules["spread_line"].notna())
].copy()
games["ats_margin"] = games["result"] - games["spread_line"]  # 0 is a push
games["home_covered"] = games["ats_margin"] > 0
# A pick'em (spread_line == 0) has no favorite and is labelled "Away".
games["favorite"] = np.where(games["spread_line"] > 0, "Home", "Away")


def spread_bucket(s):
    """Return the size bucket label for a spread, ignoring its sign."""
    s = abs(s)
    if s <= 3:
        return "Pick/FG"
    elif s <= 7:
        return "4-7 pts"
    elif s <= 10:
        return "8-10 pts"
    else:
        return "10+ pts"


games["spread_bucket"] = games["spread_line"].apply(spread_bucket)

# ATS performance by spread size
ats_by_spread = (
    games.groupby("spread_bucket")
    .agg(
        games=("home_covered", "count"),
        home_cover_pct=("home_covered", lambda x: x.mean() * 100),
        avg_margin=("ats_margin", "mean"),
    )
    .reset_index()
)
print("ATS Performance by Spread Size:")
print(ats_by_spread)

# Favorite vs underdog
# Each side is graded on its own so a push counts as a cover for neither;
# the two percentages sum to less than 100 when pushes occur.
home_favored = games["favorite"] == "Home"
games["favorite_covered"] = np.where(
    home_favored, games["ats_margin"] > 0, games["ats_margin"] < 0
)
games["underdog_covered"] = np.where(
    home_favored, games["ats_margin"] < 0, games["ats_margin"] > 0
)
fav_cover_pct = games["favorite_covered"].mean() * 100
dog_cover_pct = games["underdog_covered"].mean() * 100
print(f"\nFavorite cover %: {fav_cover_pct:.1f}%")
print(f"Underdog cover %: {dog_cover_pct:.1f}%")
nflfastR
tidyverse
nfl_data_py
pandas
numpy
library(nflfastR)
library(tidyverse)
# First-half vs second-half scoring comparison for the 2023 season.
pbp <- load_pbp(2023)
schedules <- load_schedules(2023)

# Calculate first half scores
# In nflfastR play-by-play, `home_score`/`away_score` hold the FINAL score on
# every row; the running score is `total_home_score`/`total_away_score`.
# A running score never decreases, so its maximum over quarters 1-2 is the
# halftime score.
first_half <- pbp %>%
  filter(qtr <= 2) %>%
  group_by(game_id) %>%
  summarize(
    home_1h = max(total_home_score, na.rm = TRUE),
    away_1h = max(total_away_score, na.rm = TRUE),
    .groups = "drop"
  )

# Get final scores
final_scores <- schedules %>%
  filter(!is.na(result)) %>%
  select(game_id, home_score, away_score, spread_line, result)

# Combine
game_halves <- final_scores %>%
  left_join(first_half, by = "game_id") %>%
  mutate(
    home_2h = home_score - home_1h,
    away_2h = away_score - away_1h,
    result_1h = home_1h - away_1h,
    result_2h = home_2h - away_2h,
    # Compare halves
    home_stronger_2h = result_2h > result_1h,
    # Comparing signs keeps a halftime tie from counting as "same winner"
    # when the away team goes on to win.
    halves_same_winner = sign(result_1h) == sign(result)
  )

# Analysis
# na.rm guards against games with no matching play-by-play rows.
half_analysis <- game_halves %>%
  summarize(
    games = n(),
    avg_1h_margin = mean(result_1h, na.rm = TRUE),
    avg_2h_margin = mean(result_2h, na.rm = TRUE),
    same_winner_pct = mean(halves_same_winner, na.rm = TRUE) * 100,
    home_stronger_2h_pct = mean(home_stronger_2h, na.rm = TRUE) * 100
  )
cat("First Half vs Second Half Analysis:\n")
print(half_analysis)
import nfl_data_py as nfl
import pandas as pd
# First-half vs second-half scoring comparison for the 2023 season.
pbp = nfl.import_pbp_data([2023])
schedules = nfl.import_schedules([2023])

# Calculate first half scores
# In nflverse play-by-play, `home_score`/`away_score` hold the FINAL score on
# every row; the running score is `total_home_score`/`total_away_score`.
# A running score never decreases, so its maximum over quarters 1-2 is the
# halftime score.
first_half = (
    pbp[pbp["qtr"] <= 2]
    .groupby("game_id")
    .agg(
        home_1h=("total_home_score", "max"),
        away_1h=("total_away_score", "max"),
    )
    .reset_index()
)

# Get final scores
final_scores = schedules[schedules["result"].notna()][
    ["game_id", "home_score", "away_score", "spread_line", "result"]
]

# Combine
game_halves = final_scores.merge(first_half, on="game_id")
game_halves["home_2h"] = game_halves["home_score"] - game_halves["home_1h"]
game_halves["away_2h"] = game_halves["away_score"] - game_halves["away_1h"]
game_halves["result_1h"] = game_halves["home_1h"] - game_halves["away_1h"]
game_halves["result_2h"] = game_halves["home_2h"] - game_halves["away_2h"]
# Both the "home ahead" and "away ahead" flags must agree, so a halftime tie
# does not count as "same winner" when the away team goes on to win.
game_halves["same_winner"] = (
    (game_halves["result_1h"] > 0) == (game_halves["result"] > 0)
) & (
    (game_halves["result_1h"] < 0) == (game_halves["result"] < 0)
)
game_halves["home_stronger_2h"] = game_halves["result_2h"] > game_halves["result_1h"]

# Analysis
half_analysis = {
    "games": len(game_halves),
    "avg_1h_margin": game_halves["result_1h"].mean(),
    "avg_2h_margin": game_halves["result_2h"].mean(),
    "same_winner_pct": game_halves["same_winner"].mean() * 100,
    "home_stronger_2h_pct": game_halves["home_stronger_2h"].mean() * 100
}
print("First Half vs Second Half Analysis:")
print(pd.DataFrame([half_analysis]))
nflfastR
tidyverse
nfl_data_py
pandas
library(nflfastR)
library(tidyverse)
# Load the schedule table for seasons 2019 through 2023; the division
# analysis below reads result, spread_line, the team columns, and the scores.
schedules <- load_schedules(2019:2023)
# Division membership: a named list mapping each division to its four team
# abbreviations. Each division is written as one space-separated string and
# split into a character vector.
divisions <- lapply(
  c(
    AFC_East = "BUF MIA NE NYJ",
    AFC_North = "BAL CIN CLE PIT",
    AFC_South = "HOU IND JAX TEN",
    AFC_West = "DEN KC LV LAC",
    NFC_East = "DAL NYG PHI WAS",
    NFC_North = "CHI DET GB MIN",
    NFC_South = "ATL CAR NO TB",
    NFC_West = "ARI LAR SF SEA"
  ),
  function(members) strsplit(members, " ", fixed = TRUE)[[1]]
)
# Look up the division of a single team abbreviation.
# Returns the name of the first entry in `divisions` that contains `team`,
# or NA when no division lists it.
get_division <- function(team) {
  is_member <- vapply(
    divisions,
    function(members) team %in% members,
    logical(1)
  )
  if (any(is_member)) {
    return(names(divisions)[which(is_member)[1]])
  }
  NA
}
# Analyze division games
# Tag every completed game that has a spread with both teams' divisions;
# get_division() handles one team at a time, hence rowwise().
div_games <- schedules %>%
  filter(!is.na(result), !is.na(spread_line)) %>%
  rowwise() %>%
  mutate(
    home_div = get_division(home_team),
    away_div = get_division(away_team),
    is_division_game = home_div == away_div
  ) %>%
  ungroup()

# Compare division vs non-division
# nflverse convention: a POSITIVE spread_line means the home team was favored
# by that many points (same scale as `result`, home minus away), so the home
# team covers when result - spread_line > 0.
div_comparison <- div_games %>%
  group_by(is_division_game) %>%
  summarize(
    games = n(),
    home_win_pct = mean(result > 0) * 100,
    home_cover_pct = mean(result - spread_line > 0) * 100,
    avg_total = mean(home_score + away_score),
    avg_margin = mean(abs(result)),
    .groups = "drop"
  )
cat("Division vs Non-Division Game Analysis:\n")
print(div_comparison)
import nfl_data_py as nfl
import pandas as pd
# Load the schedule table for seasons 2019 through 2023; the division
# analysis below reads result, spread_line, the team columns, and the scores.
schedules = nfl.import_schedules([2019, 2020, 2021, 2022, 2023])
# Division membership: maps each division name to its four team
# abbreviations. Each division is written as one space-separated string and
# split into a list.
divisions = {
    name: members.split()
    for name, members in (
        ("AFC_East", "BUF MIA NE NYJ"),
        ("AFC_North", "BAL CIN CLE PIT"),
        ("AFC_South", "HOU IND JAX TEN"),
        ("AFC_West", "DEN KC LV LAC"),
        ("NFC_East", "DAL NYG PHI WAS"),
        ("NFC_North", "CHI DET GB MIN"),
        ("NFC_South", "ATL CAR NO TB"),
        ("NFC_West", "ARI LAR SF SEA"),
    )
}
def get_division(team):
    """Return the name of the first division listing `team`, or None."""
    matches = (div for div, teams in divisions.items() if team in teams)
    return next(matches, None)
# Analyze division games
# Tag every completed game that has a spread with both teams' divisions.
games = schedules[
    (schedules["result"].notna()) & (schedules["spread_line"].notna())
].copy()
games["home_div"] = games["home_team"].apply(get_division)
games["away_div"] = games["away_team"].apply(get_division)
games["is_division_game"] = games["home_div"] == games["away_div"]

# nflverse convention: a POSITIVE spread_line means the home team was favored
# by that many points (same scale as `result`, home minus away), so the home
# team covers when result - spread_line > 0.
games["home_covered"] = (games["result"] - games["spread_line"]) > 0
games["total_points"] = games["home_score"] + games["away_score"]

# Compare division vs non-division
div_comparison = (
    games.groupby("is_division_game")
    .agg(
        games=("result", "count"),
        home_win_pct=("result", lambda x: (x > 0).mean() * 100),
        home_cover_pct=("home_covered", lambda x: x.mean() * 100),
        avg_total=("total_points", "mean"),
        avg_margin=("result", lambda x: x.abs().mean()),
    )
    .reset_index()
)
div_comparison["is_division_game"] = div_comparison["is_division_game"].map(
    {True: "Division", False: "Non-Division"}
)
print("Division vs Non-Division Game Analysis:")
print(div_comparison)
nflfastR
tidyverse
nfl_data_py
pandas
nflfastR - Play-by-play data with EPA
nflplotR - NFL team logos & plotting
tidyverse - Data manipulation & visualization
ggplot2 - Advanced visualizations
nfl_data_py - NFL data (nflverse compatible)
pandas - Data manipulation
matplotlib - Visualizations
scikit-learn - Machine learning
Learn the theory behind these techniques in our comprehensive tutorial series.
Browse Tutorials