Skip to content

Commit

Permalink
fix passing
Browse files Browse the repository at this point in the history
  • Loading branch information
akeaswaran committed Sep 1, 2024
1 parent 291e9cf commit a041ca7
Show file tree
Hide file tree
Showing 3 changed files with 124 additions and 14 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
# Versions should comply with PEP440. For a discussion on single-sourcing
# the version across setup.py and the project code, see
# https://packaging.python.org/en/latest/single_source_version.html
version="0.0.36.2.12",
version="0.0.36.2.13",
description="Retrieve Sports data in Python",
long_description=long_description,
long_description_content_type="text/markdown",
Expand Down
61 changes: 51 additions & 10 deletions sportsdataverse/cfb/cfb_pbp.py
Original file line number Diff line number Diff line change
Expand Up @@ -2309,6 +2309,7 @@ def __add_play_category_flags(self, play_df):
[True, True, True],
default=False,
)
play_df['dropback'] = (play_df["pass"] == True) | (play_df['sack_vec'] == True)

play_df["target"] = np.select(
[
Expand Down Expand Up @@ -2693,43 +2694,72 @@ def __add_yardage_cols(self, play_df):
play_df["yds_receiving"] = np.select(
[
(play_df["pass"] == True)
& (play_df.text.str.contains("complete to", case=False))
& (play_df.text.str.contains(" complete to", case=False))
& (play_df.text.str.contains(r"for no gain", case=False)),

(play_df["pass"] == True)
& (play_df.text.str.contains("complete to", case=False))
& (play_df.text.str.contains(" complete to", case=False))
& (play_df.text.str.contains("for a loss", case=False)),

(play_df["pass"] == True)
& (play_df.text.str.contains("complete to", case=False)),
& (play_df.text.str.contains(" complete to", case=False))#,
& (play_df.text.str.contains(" for .* y\w*ds?", regex = True, case = False)),

(play_df["pass"] == True)
& (play_df.text.str.contains("complete to", case=False)),
& (play_df.text.str.contains(" complete to", case=False)),

(play_df["pass"] == True)
& (play_df.text.str.contains(" complete to", case=False)),

(play_df["pass"] == True)
& (play_df.text.str.contains("incomplete", case=False)),

(play_df["pass"] == True)
& (play_df["type.text"].str.contains("incompletion", case=False)),

(play_df["pass"] == True)
& (play_df.text.str.contains("Yd pass", case=False)),
],
[
0.0,

-1
* play_df.text.str.extract(
r"((?<=for a loss of)[^,]+)", flags=re.IGNORECASE
)[0]
.str.extract(r"(\d+)")[0]
.astype(float),

play_df.text.str.extract(r"((?<=for)[^,]+)", flags=re.IGNORECASE)[0]
.str.extract(r"(\d+)")[0]
.astype(float),

play_df['start.yardsToEndzone'] - play_df['end.yardsToEndzone'],

play_df.text.str.extract(r"((?<=for)[^,]+)", flags=re.IGNORECASE)[0]
.str.extract(r"(\d+)")[0]
.astype(float),

0.0,

0.0,

play_df.text.str.extract(r"(\d+)\s+Yd\s+pass", flags=re.IGNORECASE)[0]
.str.extract(r"(\d+)")[0]
.astype(float),
],
default=None,
default = None,
)
play_df['statYardage'] = np.select(
[
(play_df["pass"] == True)
& (play_df.text.str.contains(" complete to ", case=False))
& (play_df['statYardage'] == 0)
],
[
play_df['yds_receiving']
],
default = play_df['statYardage']
)

play_df["yds_int_return"] = None
Expand Down Expand Up @@ -2952,6 +2982,15 @@ def __add_yardage_cols(self, play_df):
],
default=None,
)
play_df["yds_passing"] = np.select(
[
play_df.sack == True,
],
[
play_df.yds_sacked
],
default = play_df["yds_receiving"]
)

play_df["yds_penalty"] = np.select(
[(play_df.penalty_detail == 1)],
Expand Down Expand Up @@ -4108,14 +4147,15 @@ def __after_cols(self, play_df):
[
(play_df["type.text"] != "Penalty")
& (play_df.sp == False)
& (play_df.statYardage < 0),
& (play_df.statYardage < 0)
& (play_df["int"] == False), # INT can't be a TFL
(play_df["sack_vec"] == True),
],
[True, True],
default=False,
)
play_df["TFL_pass"] = np.where(
(play_df["TFL"] == True) & (play_df["pass"] == True), True, False
(play_df["TFL"] == True) & (play_df["pass"] == True) & (play_df["int"] == False), True, False
)
play_df["TFL_rush"] = np.where(
(play_df["TFL"] == True) & (play_df["rush"] == True), True, False
Expand Down Expand Up @@ -5116,15 +5156,16 @@ def create_box_score(self):
passer_box = pass_box[(pass_box["pass"] == True) & (pass_box["scrimmage_play"] == True)].fillna(0.0).groupby(by=["pos_team","passer_player_name"], as_index=False, group_keys = False).agg(
Comp = ('completion', sum),
Att = ('pass_attempt',sum),
Yds = ('yds_receiving',sum),
Yds = ('yds_passing', sum),
Pass_TD = ('pass_td', sum),
Int = ('int', sum),
YPA = ('yds_receiving', mean),
YPA = ('yds_passing', mean),
EPA = ('EPA', sum),
EPA_per_Play = ('EPA', mean),
WPA = ('wpa', sum),
SR = ('EPA_success', mean),
Sck = ('sack_vec', sum)
Sck = ('sack_vec', sum),
SckYds = ('yds_sacked', sum)
).round(2)
passer_box = passer_box.replace({np.nan: None})
qbs_list = passer_box.passer_player_name.to_list()
Expand Down
75 changes: 72 additions & 3 deletions tests/cfb/test_pbp.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,29 @@ def test_adv_box_score(box_score):
assert box_score != None
assert len(set(box_score.keys()).difference({"win_pct","pass","team","situational","receiver","rush","receiver","defensive","turnover","drives"})) == 0

def test_havoc_rate(box_score):
def test_havoc_rate(generated_data):
generated_data.run_processing_pipeline()
box_score = generated_data.create_box_score()


defense_home = box_score["defensive"][0]
# print(defense_home)
pd = defense_home.get("pass_breakups", 0)
home_int = defense_home.get("Int", 0)
home_int = defense_home.get("def_int", 0)
tfl = defense_home.get("TFL", 0)
fum = defense_home.get("fumbles", 0)
plays = defense_home.get("scrimmage_plays", 0)

# mask = (generated_data.plays_json.statYardage < 0) & (generated_data.plays_json.penalty_flag == False) & (generated_data.plays_json["start.team.id"] != 2567)
# LOGGER.info(generated_data.plays_json[mask][["id", "text", "statYardage", "havoc", "start.down", "start.yardsToEndzone", "end.down", "end.yardsToEndzone", "int", "forced_fumble"]].to_json(orient = "records", indent = 2))
LOGGER.info(generated_data.plays_json[(generated_data.plays_json.havoc == True) & (generated_data.plays_json.penalty_flag == False) & (generated_data.plays_json["start.team.id"] != 2567)][["id", "text", "statYardage", "havoc", "start.down", "start.yardsToEndzone", "end.down", "end.yardsToEndzone", "int", "forced_fumble", "TFL", "TFL_pass", "TFL_rush"]].to_json(orient = "records", indent = 2))
LOGGER.info({
"pd": pd,
"home_int": home_int,
"tfl": tfl,
"fum": fum
})

assert plays > 0
assert defense_home["havoc_total"] == (pd + home_int + tfl + fum)
assert round(defense_home["havoc_total_rate"], 4) == round(((pd + home_int + tfl + fum) / plays), 4)
Expand Down Expand Up @@ -358,4 +372,59 @@ def test_available_yards():

# plays = test.plays_json

LOGGER.info(box)
LOGGER.info(box)


def test_bugged_pass_yards():
    """Check box-score passing yardage against three live ESPN games.

    Games exercised (each one is fetched over the network):

    * 401628456 -- 2024 W1, Idaho vs Oregon; marked here as a known bugged
      game. The expected passing totals are written as
      ``gross - sack yards``.
    * 401634302 -- 2024 W1, GAST vs Georgia Tech; marked here as a known
      good game.
    * 401634169 -- edge case: completed pass, fumble, recovery.

    Rushing totals are asserted alongside the passing totals to confirm
    they were not changed.
    """
    # known bugged game - 2024 W1: Idaho vs Oregon
    test = CFBPlayProcess(gameId = 401628456)
    test.espn_cfb_pbp()
    # Return value is not needed; the processed plays are read from plays_json.
    test.run_processing_pipeline()

    plays = test.plays_json
    bad_yards_play = plays[
        ((plays['text'].str.contains(" pass complete ")) & (plays['start.team.id'] == 70)) # Idaho passing yards
        | ((plays['text'].str.contains(" sacked ")) & (plays['start.team.id'] == 70))
    ]
    LOGGER.info(bad_yards_play[["id", "text", "yds_receiving", "statYardage", "start.yardsToEndzone", "end.yardsToEndzone", "yds_sacked"]].to_json(orient = "records", indent = 2))

    box = test.create_box_score()
    LOGGER.info(box['pass'][0])
    LOGGER.info(box['rush'][0])

    assert box['pass'][0]['Yds'] == (168 - 25) # make sure a bugged game matches the right total
    assert box['rush'][0]['Yds'] == 47 # rush totals should not have been changed

    # make sure sack yardage is accounted for
    gabriel_rows = [row for row in box['pass'] if row['passer_player_name'] == "Dillon Gabriel"]
    # Fail with a readable message rather than an IndexError if the passer is absent.
    assert gabriel_rows, "Dillon Gabriel missing from passing box score"
    assert gabriel_rows[0]['Yds'] == (380 - 23) # make sure a bugged game matches the right total

    # known good game - 2024 W1: GAST vs Georgia Tech
    good = CFBPlayProcess(gameId = 401634302)
    good.espn_cfb_pbp()
    good.run_processing_pipeline()

    good_plays = good.plays_json
    good_yards_play = good_plays[
        (good_plays['text'].str.contains(" pass complete ")) & (good_plays['start.team.id'] == 59) # GT passing yards
    ]
    LOGGER.info(good_yards_play[["id", "text", "yds_receiving", "statYardage", "start.yardsToEndzone", "end.yardsToEndzone"]].to_json(orient = "records", indent = 2))

    good_box = good.create_box_score()
    LOGGER.info(good_box['pass'][1])
    LOGGER.info(good_box['rush'][1])

    assert good_box['pass'][1]['Yds'] == -1 # make sure a non-bugged game matches the right total
    assert good_box['rush'][1]['Yds'] == 20 # rush totals should not have been changed

    # edge case: completed pass, fumble, recovery
    edge = CFBPlayProcess(gameId = 401634169)
    edge.espn_cfb_pbp()
    edge.run_processing_pipeline()

    edge_plays = edge.plays_json
    # regex=False: the play description is a literal string, so the "." in
    # "G. Ollendieck" must not be treated as a regex wildcard.
    edge_yards_play = edge_plays[
        (edge_plays['text'].str.contains("Hudson Card pass complete to Drew Biber for 2 yds fumbled, forced by Maddix Blackwell, recovered by INST Garret Ollendieck G. Ollendieck return for 0 yds", regex = False))
    ]
    LOGGER.info(edge_yards_play[["id", "text", "yds_receiving", "statYardage", "start.yardsToEndzone", "end.yardsToEndzone"]].to_json(orient = "records", indent = 2))
    # Guard the positional lookup below so a missing play fails clearly.
    assert not edge_yards_play.empty, "edge-case fumble play not found in play-by-play"
    assert edge_yards_play.loc[edge_yards_play.index[0], 'yds_receiving'] == 2

0 comments on commit a041ca7

Please sign in to comment.