Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement new FanGraphs Leaderboard API #398

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/fangraphs.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
| team | str | Team to filter data by. <br> Specify "0,ts" to get aggregate team data.
| position | str | Position to filter data by. <br> Default = ALL
| max_results | int | The maximum number of results to return. <br> Default = 1000000 (In effect, all results)
| legacy | bool | If True, fetch data from the legacy leaderboard page instead of the new leaderboard API. <br> Default = False


## Usage
Expand Down
38 changes: 22 additions & 16 deletions pybaseball/datasources/fangraphs.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
stat_list_from_str, stat_list_to_str)
from .html_table_processor import HTMLTableProcessor, RowIdFunction

_FG_LEADERS_URL = "/leaders-legacy.aspx"
_FG_LEADERS_URL = "/api/leaders/major-league/data"
_LEGACY_FG_LEADERS_URL = "/leaders-legacy.aspx"

MIN_AGE = 0
MAX_AGE = 100
Expand Down Expand Up @@ -43,6 +44,7 @@ class FangraphsDataTable(ABC):
DATA_ROWS_XPATH: str = "({TABLE_XPATH}/tbody//tr)"
DATA_CELLS_XPATH: str = "td[position()>1]/descendant-or-self::*/text()"
QUERY_ENDPOINT: str = _FG_LEADERS_URL
LEGACY_QUERY_ENDPOINT: str = _LEGACY_FG_LEADERS_URL
STATS_CATEGORY: FangraphsStatsCategory = FangraphsStatsCategory.NONE
DEFAULT_STAT_COLUMNS: List[FangraphsStatColumn] = []
KNOWN_PERCENTAGES: List[str] = []
Expand Down Expand Up @@ -77,7 +79,7 @@ def fetch(self, start_season: int, end_season: Optional[int] = None, league: str
stat_columns: Union[str, List[str]] = 'ALL', qual: Optional[int] = None, split_seasons: bool = True,
month: str = 'ALL', on_active_roster: bool = False, minimum_age: int = MIN_AGE,
maximum_age: int = MAX_AGE, team: str = '', _filter: str = '', players: str = '',
position: str = 'ALL', max_results: int = 1000000,) -> pd.DataFrame:
position: str = 'ALL', max_results: int = 1000000, legacy: bool = False) -> pd.DataFrame:

"""
Get leaderboard data from Fangraphs.
Expand Down Expand Up @@ -146,22 +148,26 @@ def fetch(self, start_season: int, end_season: Optional[int] = None, league: str
'age': f"{minimum_age},{maximum_age}",
'filter': _filter,
'players': players,
'page': f'1_{max_results}'
'page': f'1_{max_results}',
'pageitems': max_results # New Fangraphs Leaderboard uses pageitems to get maximum results per page
}

return self._validate(
self._postprocess(
self.html_accessor.get_tabular_data_from_options(
self.QUERY_ENDPOINT,
query_params=url_options,
# TODO: Remove the type: ignore after this is fixed: https://github.com/python/mypy/issues/5485
column_name_mapper=self.COLUMN_NAME_MAPPER, # type: ignore
known_percentages=self.KNOWN_PERCENTAGES,
row_id_func=self.ROW_ID_FUNC,
row_id_name=self.ROW_ID_NAME,
)
)
)
        # The `legacy` flag selects the legacy leaderboard page; otherwise the new leaderboard API is queried
tabular_data = self.html_accessor.get_tabular_data_from_options(
self.LEGACY_QUERY_ENDPOINT,
query_params=url_options,
# TODO: Remove the type: ignore after this is fixed: https://github.com/python/mypy/issues/5485
column_name_mapper=self.COLUMN_NAME_MAPPER, # type: ignore
known_percentages=self.KNOWN_PERCENTAGES,
row_id_func=self.ROW_ID_FUNC,
row_id_name=self.ROW_ID_NAME,
) if legacy else self.html_accessor.get_tabular_data_from_api(
f"{self.ROOT_URL}{self.QUERY_ENDPOINT}",
query_params=url_options
)


return self._validate(self._postprocess(tabular_data))

class FangraphsBattingStatsTable(FangraphsDataTable):
STATS_CATEGORY: FangraphsStatsCategory = FangraphsStatsCategory.BATTING
Expand Down
20 changes: 20 additions & 0 deletions pybaseball/datasources/html_table_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import lxml.etree
import pandas as pd
import requests
import json
import re

from ..datahelpers import postprocessing
from ..datahelpers.column_mapper import ColumnListMapperFunction
Expand Down Expand Up @@ -95,3 +97,21 @@ def get_tabular_data_from_options(self, base_url: str, query_params: Dict[str, U
row_id_func=row_id_func,
row_id_name=row_id_name,
)

def get_tabular_data_from_api(self, base_url: str, query_params: Dict[str, Union[str, int]]):
# Newest Fangraphs Leaderboard API will return html tag in `Name` and `Team` column
# Therefore we need to extract the name and team from response result
def extract_text_from_html(text):
try:
return re.search('>(.+?)<', text).group(1)
except AttributeError:
return text

data = requests.get(base_url, query_params).content
data = json.loads(data)

df = pd.DataFrame(data['data'])
df['Name'] = df['Name'].apply(extract_text_from_html)
df['Team'] = df['Team'].apply(extract_text_from_html)

return df
Loading