fix passing

sportsdataverse · Sep 1, 2024 · a041ca7 · a041ca7
1 parent 291e9cf
commit a041ca7
Show file tree

Hide file tree

Showing 3 changed files with 124 additions and 14 deletions.
diff --git a/setup.py b/setup.py
@@ -38,7 +38,7 @@
     # Versions should comply with PEP440.  For a discussion on single-sourcing
     # the version across setup.py and the project code, see
     # https://packaging.python.org/en/latest/single_source_version.html
-    version="0.0.36.2.12",
+    version="0.0.36.2.13",
     description="Retrieve Sports data in Python",
     long_description=long_description,
     long_description_content_type="text/markdown",

diff --git a/sportsdataverse/cfb/cfb_pbp.py b/sportsdataverse/cfb/cfb_pbp.py
@@ -2309,6 +2309,7 @@ def __add_play_category_flags(self, play_df):
             [True, True, True],
             default=False,
         )
+        play_df['dropback'] = (play_df["pass"] == True) | (play_df['sack_vec'] == True)
 
         play_df["target"] = np.select(
             [
@@ -2693,43 +2694,72 @@ def __add_yardage_cols(self, play_df):
         play_df["yds_receiving"] = np.select(
             [
                 (play_df["pass"] == True)
-                & (play_df.text.str.contains("complete to", case=False))
+                & (play_df.text.str.contains(" complete to", case=False))
                 & (play_df.text.str.contains(r"for no gain", case=False)),
+
                 (play_df["pass"] == True)
-                & (play_df.text.str.contains("complete to", case=False))
+                & (play_df.text.str.contains(" complete to", case=False))
                 & (play_df.text.str.contains("for a loss", case=False)),
+
                 (play_df["pass"] == True)
-                & (play_df.text.str.contains("complete to", case=False)),
+                & (play_df.text.str.contains(" complete to", case=False))#,
+                & (play_df.text.str.contains(" for .* y\w*ds?", regex = True, case = False)),
+
                 (play_df["pass"] == True)
-                & (play_df.text.str.contains("complete to", case=False)),
+                & (play_df.text.str.contains(" complete to", case=False)),
+
+                (play_df["pass"] == True)
+                & (play_df.text.str.contains(" complete to", case=False)),
+
                 (play_df["pass"] == True)
                 & (play_df.text.str.contains("incomplete", case=False)),
+
                 (play_df["pass"] == True)
                 & (play_df["type.text"].str.contains("incompletion", case=False)),
+
                 (play_df["pass"] == True)
                 & (play_df.text.str.contains("Yd pass", case=False)),
             ],
             [
                 0.0,
+
                 -1
                 * play_df.text.str.extract(
                     r"((?<=for a loss of)[^,]+)", flags=re.IGNORECASE
                 )[0]
                 .str.extract(r"(\d+)")[0]
                 .astype(float),
+
                 play_df.text.str.extract(r"((?<=for)[^,]+)", flags=re.IGNORECASE)[0]
                 .str.extract(r"(\d+)")[0]
                 .astype(float),
+
+                play_df['start.yardsToEndzone'] - play_df['end.yardsToEndzone'],
+
                 play_df.text.str.extract(r"((?<=for)[^,]+)", flags=re.IGNORECASE)[0]
                 .str.extract(r"(\d+)")[0]
                 .astype(float),
+
                 0.0,
+
                 0.0,
+
                 play_df.text.str.extract(r"(\d+)\s+Yd\s+pass", flags=re.IGNORECASE)[0]
                 .str.extract(r"(\d+)")[0]
                 .astype(float),
             ],
-            default=None,
+            default = None,
+        )
+        play_df['statYardage'] = np.select(
+            [
+                (play_df["pass"] == True)
+                & (play_df.text.str.contains(" complete to ", case=False)) 
+                & (play_df['statYardage'] == 0)
+            ],
+            [
+                play_df['yds_receiving']
+            ],
+            default = play_df['statYardage']
         )
 
         play_df["yds_int_return"] = None
@@ -2952,6 +2982,15 @@ def __add_yardage_cols(self, play_df):
             ],
             default=None,
         )
+        play_df["yds_passing"] = np.select(
+            [
+                play_df.sack == True,
+            ],
+            [
+                play_df.yds_sacked
+            ],
+            default = play_df["yds_receiving"]
+        )
 
         play_df["yds_penalty"] = np.select(
             [(play_df.penalty_detail == 1)],
@@ -4108,14 +4147,15 @@ def __after_cols(self, play_df):
             [
                 (play_df["type.text"] != "Penalty")
                 & (play_df.sp == False)
-                & (play_df.statYardage < 0),
+                & (play_df.statYardage < 0) 
+                & (play_df["int"] == False), # INT can't be a TFL
                 (play_df["sack_vec"] == True),
             ],
             [True, True],
             default=False,
         )
         play_df["TFL_pass"] = np.where(
-            (play_df["TFL"] == True) & (play_df["pass"] == True), True, False
+            (play_df["TFL"] == True) & (play_df["pass"] == True) & (play_df["int"] == False), True, False
         )
         play_df["TFL_rush"] = np.where(
             (play_df["TFL"] == True) & (play_df["rush"] == True), True, False
@@ -5116,15 +5156,16 @@ def create_box_score(self):
         passer_box = pass_box[(pass_box["pass"] == True) & (pass_box["scrimmage_play"] == True)].fillna(0.0).groupby(by=["pos_team","passer_player_name"], as_index=False, group_keys = False).agg(
             Comp = ('completion', sum),
             Att = ('pass_attempt',sum),
-            Yds = ('yds_receiving',sum),
+            Yds = ('yds_passing', sum),
             Pass_TD = ('pass_td', sum),
             Int = ('int', sum),
-            YPA = ('yds_receiving', mean),
+            YPA = ('yds_passing', mean),
             EPA = ('EPA', sum),
             EPA_per_Play = ('EPA', mean),
             WPA = ('wpa', sum),
             SR = ('EPA_success', mean),
-            Sck = ('sack_vec', sum)
+            Sck = ('sack_vec', sum),
+            SckYds = ('yds_sacked', sum)
         ).round(2)
         passer_box = passer_box.replace({np.nan: None})
         qbs_list = passer_box.passer_player_name.to_list()

diff --git a/tests/cfb/test_pbp.py b/tests/cfb/test_pbp.py
@@ -31,15 +31,29 @@ def test_adv_box_score(box_score):
     assert box_score != None
     assert len(set(box_score.keys()).difference({"win_pct","pass","team","situational","receiver","rush","receiver","defensive","turnover","drives"})) == 0
 
-def test_havoc_rate(box_score):
+def test_havoc_rate(generated_data):
+    generated_data.run_processing_pipeline()
+    box_score = generated_data.create_box_score()
+
+
     defense_home = box_score["defensive"][0]
     # print(defense_home)
     pd = defense_home.get("pass_breakups", 0)
-    home_int = defense_home.get("Int", 0)
+    home_int = defense_home.get("def_int", 0)
     tfl = defense_home.get("TFL", 0)
     fum = defense_home.get("fumbles", 0)
     plays = defense_home.get("scrimmage_plays", 0)
 
+    # mask = (generated_data.plays_json.statYardage < 0) & (generated_data.plays_json.penalty_flag == False) & (generated_data.plays_json["start.team.id"] != 2567)
+    # LOGGER.info(generated_data.plays_json[mask][["id", "text", "statYardage", "havoc", "start.down", "start.yardsToEndzone", "end.down", "end.yardsToEndzone", "int", "forced_fumble"]].to_json(orient = "records", indent = 2))
+    LOGGER.info(generated_data.plays_json[(generated_data.plays_json.havoc == True) & (generated_data.plays_json.penalty_flag == False) & (generated_data.plays_json["start.team.id"] != 2567)][["id", "text", "statYardage", "havoc", "start.down", "start.yardsToEndzone", "end.down", "end.yardsToEndzone", "int", "forced_fumble", "TFL", "TFL_pass", "TFL_rush"]].to_json(orient = "records", indent = 2))
+    LOGGER.info({
+        "pd": pd,
+        "home_int": home_int,
+        "tfl": tfl,
+        "fum": fum
+    })
+
     assert plays > 0
     assert defense_home["havoc_total"] == (pd + home_int + tfl + fum)
     assert round(defense_home["havoc_total_rate"], 4) == round(((pd + home_int + tfl + fum) / plays), 4)
@@ -358,4 +372,59 @@ def test_available_yards():
 
     # plays = test.plays_json
 
-    LOGGER.info(box)
+    LOGGER.info(box)
+
+
+def test_bugged_pass_yards():
+    test = CFBPlayProcess(gameId = 401628456)     # known bugged game - 2024 W1: Idaho vs Oregon
+    test.espn_cfb_pbp()
+    json_dict_stuff = test.run_processing_pipeline()
+
+    plays = test.plays_json
+    bad_yards_play = plays[
+        ((plays['text'].str.contains(" pass complete ")) & (plays['start.team.id'] == 70)) # Idaho passing yards
+        |  ((plays['text'].str.contains(" sacked ")) & (plays['start.team.id'] == 70))
+    ]
+    LOGGER.info(bad_yards_play[["id", "text", "yds_receiving", "statYardage", "start.yardsToEndzone", "end.yardsToEndzone", "yds_sacked"]].to_json(orient = "records", indent = 2))
+
+    box = test.create_box_score()
+    LOGGER.info(box['pass'][0])
+    LOGGER.info(box['rush'][0])
+
+    assert box['pass'][0]['Yds'] == (168 - 25) # make sure a bugged game matches the right total
+    assert box['rush'][0]['Yds'] == 47 # rush totals should not have been changed
+
+    # make sure sack yardage is accounted for
+    assert list(filter(lambda x: x['passer_player_name'] == "Dillon Gabriel", box['pass']))[0]['Yds'] == (380 - 23) # make sure a bugged game matches the right total
+
+
+    # known good game - 2024 W1: GAST vs Georgia Tech
+    good = CFBPlayProcess(gameId = 401634302)     # known good game - 2024 W1: GAST vs Georgia Tech
+    good.espn_cfb_pbp()
+    good_json = good.run_processing_pipeline()
+
+    good_plays = good.plays_json
+    good_yards_play = good_plays[
+        (good_plays['text'].str.contains(" pass complete ")) & (good_plays['start.team.id'] == 59) # GT passing yards
+    ]
+    LOGGER.info(good_yards_play[["id", "text", "yds_receiving", "statYardage", "start.yardsToEndzone", "end.yardsToEndzone"]].to_json(orient = "records", indent = 2))
+
+    good_box = good.create_box_score()
+    LOGGER.info(good_box['pass'][1])
+    LOGGER.info(good_box['rush'][1])
+
+    assert good_box['pass'][1]['Yds'] == -1 # make sure a non-bugged game matches the right total
+    assert good_box['rush'][1]['Yds'] == 20 # rush totals should not have been changed
+
+    # edge case: completed pass, fumble, recovery
+    edge = CFBPlayProcess(gameId = 401634169)
+
+    edge.espn_cfb_pbp()
+    edge_json = edge.run_processing_pipeline()
+
+    edge_plays = edge.plays_json
+    edge_yards_play = edge_plays[
+        (edge_plays['text'].str.contains("Hudson Card pass complete to Drew Biber for 2 yds fumbled, forced by Maddix Blackwell, recovered by INST Garret Ollendieck G. Ollendieck return for 0 yds"))
+    ]
+    LOGGER.info(edge_yards_play[["id", "text", "yds_receiving", "statYardage", "start.yardsToEndzone", "end.yardsToEndzone"]].to_json(orient = "records", indent = 2))
+    assert edge_yards_play.loc[edge_yards_play.index[0], 'yds_receiving'] == 2