jldbc · ss77995ss · Jan 21, 2024 · Jan 21, 2024 · Jan 21, 2024 · Jan 21, 2024
diff --git a/docs/fangraphs.md b/docs/fangraphs.md
@@ -18,6 +18,7 @@
 | team             | str              | Team to filter data by. <br> Specify "0,ts" to get aggregate team data.
 | position         | str              | Position to filter data by. <br> Default = ALL
 | max_results      | int              | The maximum number of results to return. <br> Default = 1000000 (In effect, all results)
+| legacy           | bool             | Whether to fetch data from the legacy leaderboard page instead of the new leaderboard API. <br> Default = False
 
 
 ## Usage

diff --git a/pybaseball/datasources/fangraphs.py b/pybaseball/datasources/fangraphs.py
@@ -11,7 +11,8 @@
                                stat_list_from_str, stat_list_to_str)
 from .html_table_processor import HTMLTableProcessor, RowIdFunction
 
-_FG_LEADERS_URL = "/leaders-legacy.aspx"
+_FG_LEADERS_URL = "/api/leaders/major-league/data"
+_LEGACY_FG_LEADERS_URL = "/leaders-legacy.aspx"
 
 MIN_AGE = 0
 MAX_AGE = 100
@@ -43,6 +44,7 @@ class FangraphsDataTable(ABC):
     DATA_ROWS_XPATH: str = "({TABLE_XPATH}/tbody//tr)"
     DATA_CELLS_XPATH: str = "td[position()>1]/descendant-or-self::*/text()"
     QUERY_ENDPOINT: str = _FG_LEADERS_URL
+    LEGACY_QUERY_ENDPOINT: str = _LEGACY_FG_LEADERS_URL
     STATS_CATEGORY: FangraphsStatsCategory = FangraphsStatsCategory.NONE
     DEFAULT_STAT_COLUMNS: List[FangraphsStatColumn] = []
     KNOWN_PERCENTAGES: List[str] = []
@@ -77,7 +79,7 @@ def fetch(self, start_season: int, end_season: Optional[int] = None, league: str
               stat_columns: Union[str, List[str]] = 'ALL', qual: Optional[int] = None, split_seasons: bool = True,
               month: str = 'ALL', on_active_roster: bool = False, minimum_age: int = MIN_AGE,
               maximum_age: int = MAX_AGE, team: str = '', _filter: str = '', players: str = '',
-              position: str = 'ALL', max_results: int = 1000000,) -> pd.DataFrame:
+              position: str = 'ALL', max_results: int = 1000000, legacy: bool = False) -> pd.DataFrame:
 
         """
         Get leaderboard data from Fangraphs.
@@ -146,22 +148,26 @@ def fetch(self, start_season: int, end_season: Optional[int] = None, league: str
             'age': f"{minimum_age},{maximum_age}",
             'filter': _filter,
             'players': players,
-            'page': f'1_{max_results}'
+            'page': f'1_{max_results}',
+            'pageitems': max_results # New Fangraphs Leaderboard uses pageitems to get maximum results per page
         }
 
-        return self._validate(
-            self._postprocess(
-                self.html_accessor.get_tabular_data_from_options(
-                    self.QUERY_ENDPOINT,
-                    query_params=url_options,
-                    # TODO: Remove the type: ignore after this is fixed: https://github.com/python/mypy/issues/5485
-                    column_name_mapper=self.COLUMN_NAME_MAPPER, # type: ignore
-                    known_percentages=self.KNOWN_PERCENTAGES,
-                    row_id_func=self.ROW_ID_FUNC,
-                    row_id_name=self.ROW_ID_NAME,
-                )
-            )
-        )
+        # The `legacy` flag lets callers choose between the legacy leaderboard page and the new leaderboard API
+        tabular_data = self.html_accessor.get_tabular_data_from_options(
+                            self.LEGACY_QUERY_ENDPOINT,
+                            query_params=url_options,
+                            # TODO: Remove the type: ignore after this is fixed: https://github.com/python/mypy/issues/5485
+                            column_name_mapper=self.COLUMN_NAME_MAPPER,  # type: ignore
+                            known_percentages=self.KNOWN_PERCENTAGES,
+                            row_id_func=self.ROW_ID_FUNC,
+                            row_id_name=self.ROW_ID_NAME,
+                        ) if legacy else self.html_accessor.get_tabular_data_from_api(
+                                            f"{self.ROOT_URL}{self.QUERY_ENDPOINT}",
+                                            query_params=url_options
+                                        )
+
+
+        return self._validate(self._postprocess(tabular_data))
 
 class FangraphsBattingStatsTable(FangraphsDataTable):
     STATS_CATEGORY: FangraphsStatsCategory = FangraphsStatsCategory.BATTING

diff --git a/pybaseball/datasources/html_table_processor.py b/pybaseball/datasources/html_table_processor.py
@@ -4,6 +4,8 @@
 import lxml.etree
 import pandas as pd
 import requests
+import json
+import re
 
 from ..datahelpers import postprocessing
 from ..datahelpers.column_mapper import ColumnListMapperFunction
@@ -95,3 +97,21 @@ def get_tabular_data_from_options(self, base_url: str, query_params: Dict[str, U
             row_id_func=row_id_func,
             row_id_name=row_id_name,
         )
+
+    def get_tabular_data_from_api(self, base_url: str, query_params: Dict[str, Union[str, int]]):
+        # Newest Fangraphs Leaderboard API will return html tag in `Name` and `Team` column
+        # Therefore we need to extract the name and team from response result
+        def extract_text_from_html(text):
+            try:
+                return re.search('>(.+?)<', text).group(1)
+            except AttributeError:
+                return text
+
+        data = requests.get(base_url, query_params).content
+        data = json.loads(data)
+
+        df = pd.DataFrame(data['data'])
+        df['Name'] = df['Name'].apply(extract_text_from_html)
+        df['Team'] = df['Team'].apply(extract_text_from_html)
+
+        return df