Skip to content

[SoFIFA] Add Fetch Icon #816

@rekvizitt

Description

@rekvizitt

Is your feature request related to a problem? Please describe.
Currently, the soccerdata library does not support fetching and saving team icons when retrieving team ratings from SoFIFA. This limits the ability to visually represent teams in applications or analyses that use this data.

Describe the solution you'd like
I propose adding functionality to fetch and save team icons when retrieving team ratings from SoFIFA. This would involve:

  1. Modifying the read_team_ratings method to extract the image URL from the HTML response.
  2. Adding a new method get_image to handle downloading and saving the image.
  3. Storing the path to the saved image in the returned DataFrame for easy access.

Describe alternatives you've considered

  1. Manual Download: Users could manually download images, but this is time-consuming and error-prone.
  2. External Libraries: Users could scrape the images with an external library, but integrating this directly into soccerdata provides a more seamless user experience.
  3. API Integration: If SoFIFA provides an API for images, integrating it could be more efficient, but this requires API access and documentation.

Additional context
Here is the modified code snippet that implements the proposed feature:

import os
import requests

def read_team_ratings(self) -> pd.DataFrame:
    """Retrieve ratings for all teams in the selected leagues.

    For every league/version combination the league overview page is
    fetched through ``self.get`` and parsed row by row. Besides the ratings,
    the team icon referenced in the first table cell is saved as a ``.png``
    file in ``self.data_dir`` and its location is stored in the
    ``image_path`` column.

    Returns
    -------
    pd.DataFrame
        One row per team and version, indexed by ``league`` and ``team``.
        ``image_path`` is ``None`` when no icon file is available on disk
        for the team.
    """
    # Define id and description of ratings to retrieve
    ratings = {
        "oa": "overall",
        "at": "attack",
        "md": "midfield",
        "df": "defence",
        "tb": "transfer_budget",
        "cw": "club_worth",
        "bs": "build_up_speed",
        "bd": "build_up_dribbling",
        "bp": "build_up_passing",
        "bps": "build_up_positioning",
        "cc": "chance_creation_crossing",
        "cp": "chance_creation_passing",
        "cs": "chance_creation_shooting",
        "cps": "chance_creation_positioning",
        "da": "defence_aggression",
        "dm": "defence_pressure",
        "dw": "defence_team_width",
        "dd": "defence_defender_line",
        "dp": "defence_domestic_prestige",
        "ip": "international_prestige",
        "ps": "players",
        "sa": "starting_xi_average_age",
        "ta": "whole_team_average_age",
    }

    # Build URL
    urlmask = SO_FIFA_API + "/teams?lg={}&r={}&set=true"
    for rating_id in ratings:
        urlmask += f"&showCol[]={rating_id}"
    filemask = "teams_{}_{}.html"

    # Path separators and characters that are invalid in file names on
    # common platforms; they are replaced so a team name can never point
    # outside ``self.data_dir`` or produce an unwritable file name.
    unsafe_chars = str.maketrans({char: "_" for char in '\\/:*?"<>|'})

    # Get league IDs
    leagues = self.read_leagues()

    # Collect teams
    teams = []
    iterator = list(product(leagues.iterrows(), self.versions.iterrows()))
    for i, ((lkey, league), (version_id, version)) in enumerate(iterator):
        logger.info(
            "[%s/%s] Retrieving teams for %s in %s edition",
            i + 1,
            len(iterator),
            lkey,
            version["update"],
        )
        league_id = league["league_id"]

        # Read HTML page (league overview)
        filepath = self.data_dir / filemask.format(league_id, version_id)
        url = urlmask.format(league_id, version_id)
        reader = self.get(url, filepath)

        # Extract team links
        tree = html.parse(reader)
        for node in tree.xpath("//table/tbody/tr"):
            # Extract team name
            team_name_node = node.xpath(".//td[2]//a")
            if not team_name_node:
                continue  # Skip rows without a valid team name

            # ``.text`` is None when the link has no direct text content
            team_name = (team_name_node[0].text or "").strip()
            if not team_name:
                continue  # A nameless row cannot be indexed or given an icon file

            # Extract ratings safely
            team_data = {"league": lkey, "team": team_name, **version.to_dict()}
            for key, desc in ratings.items():
                value_nodes = node.xpath(f".//td[@data-col='{key}']//text()")
                team_data[desc] = value_nodes[0].strip() if value_nodes else None

            # Always create the column so the frame has a stable schema
            team_data["image_path"] = None

            # Extract image URL ("data-src" is tried first, then "src")
            img_node = node.xpath(".//td[1]//img")
            img_url = None
            if img_node:
                img_url = img_node[0].get("data-src") or img_node[0].get("src")

            if img_url:
                safe_name = team_name.translate(unsafe_chars)
                img_filename = os.path.join(self.data_dir, f"{safe_name}.png")
                # The same team shows up once per version; download only once
                if not os.path.exists(img_filename):
                    self.get_image(img_url, img_filename)
                # Only advertise the path if the file really is on disk
                if os.path.exists(img_filename):
                    team_data["image_path"] = img_filename

            teams.append(team_data)

    # Return DataFrame
    return (
        pd.DataFrame(teams)
        .replace({"team": TEAMNAME_REPLACEMENTS})
        .set_index(["league", "team"])
        .sort_index()
    )

def get_image(self, url, save_path, timeout=30):
    """Download the image at ``url`` and write it to ``save_path``.

    Parameters
    ----------
    url : str
        Address of the image to download.
    save_path : str or path-like
        File the image bytes are written to. An existing file is overwritten.
    timeout : float, default 30
        Seconds to wait for the server before ``requests`` gives up and
        raises a timeout error.

    Returns
    -------
    bool
        ``True`` if the image was saved, ``False`` if the server did not
        answer with HTTP 200 (nothing is written in that case).
    """
    # NOTE(review): this goes straight through ``requests`` rather than
    # ``self.get``; confirm whether the scraper's own session settings
    # should apply to image downloads as well.
    # Without a timeout a stalled connection would block the scrape forever.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return False
    with open(save_path, "wb") as f:
        f.write(response.content)
    return True

My test code:

import soccerdata as sd
import pandas as pd
import numpy as np
from pathlib import Path
import logging
import os

# Show INFO-level messages so the scraper's progress log is visible
logging.basicConfig(level=logging.INFO)

leagues = ["ENG-Premier League"]
seasons = ["2024-25"]  # not used below; the SoFIFA reader is selected by version

# All output lives under ./test_data, relative to the working directory
current_dir = Path.cwd()
data_dir = current_dir / 'test_data'
raw_data_dir = data_dir / 'raw'
sofifa_data_dir = data_dir / 'soFIFA'

raw_data_dir.mkdir(parents=True, exist_ok=True)
sofifa_data_dir.mkdir(parents=True, exist_ok=True)

sofifa = sd.SoFIFA(leagues=leagues, versions='latest', data_dir=sofifa_data_dir)

logging.info("Fetching team ratings from SoFIFA...")
try:
    sofifa_ratings = sofifa.read_team_ratings()
except Exception as e:
    # Top-level boundary of the script: log with traceback and stop here.
    logging.exception("Error fetching SoFIFA data: %s", e)
else:
    logging.info("SoFIFA team ratings fetched successfully.")
    # Export only on success; outside the try/except this line would fail
    # with a NameError whenever the fetch above raised.
    # orient='records' omits the index, so move league/team into columns
    # first to keep each record identifiable.
    sofifa_ratings.reset_index().to_json(
        raw_data_dir / 'sofifa_ratings.json', orient='records', indent=4
    )

Result:

Image

Metadata

Metadata

Assignees

No one assigned

    Labels

    enhancement: New feature or request

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions