Source code for floodlight.io.datasets

import json
import os
from typing import Tuple, Dict
from urllib.error import HTTPError, URLError

import h5py
import numpy as np
import pandas as pd

from floodlight.io.utils import extract_zip, download_from_url
from floodlight.io.statsbomb import (
    read_open_event_data_json,
    read_teamsheets_from_open_event_data_json,
)
from floodlight import XY, Pitch, Events, Code
from floodlight.io.dfl import read_event_data_xml, read_position_data_xml
from floodlight.core.teamsheet import Teamsheet
from floodlight.settings import DATA_DIR



[docs]
class EIGDDataset:
    """This dataset loads the EIGD-H data from the *A Unified Taxonomy and Multimodal
    Dataset for Events in Invasion Games* paper. [1]_

    Upon instantiation, the class checks if the data already exists in the repository's
    root ``.data``-folder, and will download the files (~120MB) to this folder if not.

    Parameters
    ----------
    dataset_dir_name: str, optional
        Name of subdirectory where the dataset is stored within the root .data
        directory. Defaults to 'eigd_dataset'.

    Notes
    -----
    The dataset contains a total of 25 short samples of spatiotemporal data for both
    teams and the ball from the German Men's Handball Bundesliga (HBL). For more
    information, visit the
    `official project repository <https://github.com/MM4SPA/eigd>`_.
    Data for one sample can be queried calling the :func:`~EIGDDataset.get`-method
    specifying the match and segment. The following matches and segments are
    available::

        matches = ['48dcd3', 'ad969d', 'e0e547', 'e8a35a', 'ec7a6a']
        segments = {
            '48dcd3': ['00-06-00', '00-15-00', '00-25-00', '01-05-00', '01-10-00'],
            'ad969d': ['00-00-30', '00-15-00', '00-43-00', '01-11-00', '01-35-00'],
            'e0e547': ['00-00-00', '00-08-00', '00-15-00', '00-50-00', '01-00-00'],
            'e8a35a': ['00-02-00', '00-07-00', '00-14-00', '01-05-00', '01-14-00'],
            'ec7a6a': ['00-30-00', '00-53-00', '01-19-00', '01-30-00', '01-40-00'],
        }

    Examples
    --------
    >>> from floodlight.io.datasets import EIGDDataset

    >>> dataset = EIGDDataset()
    # get one sample
    >>> teamA, teamB, ball = dataset.get(match_name="48dcd3", segment="00-06-00")
    # get the corresponding pitch
    >>> pitch = dataset.get_pitch()


    References
    ----------
        .. [1] `Biermann, H., Theiner, J., Bassek, M., Raabe, D., Memmert, D., & Ewerth,
            R. (2021, October). A Unified Taxonomy and Multimodal Dataset for Events in
            Invasion Games. In Proceedings of the 4th International Workshop on
            Multimedia Content Analysis in Sports (pp. 1-10).
            <https://dl.acm.org/doi/abs/10.1145/3475722.3482792>`_
    """

    def __init__(self, dataset_dir_name="eigd_dataset"):
        # components of the remote archive location
        self._EIGD_SCHEMA = "https"
        self._EIGD_BASE_URL = (
            "data.uni-hannover.de/dataset/8ccb364e-145f-4b28-8ff4-954b86e9b30d/"
            "resource/fd24e032-742d-4609-9052-cec310a2a563/download"
        )
        self._EIGD_FILENAME = "eigd-h_pos.zip"
        self._EIGD_HOST_URL = (
            self._EIGD_SCHEMA + "://" + self._EIGD_BASE_URL + "/" + self._EIGD_FILENAME
        )

        # properties of the extracted sample files
        self._EIGD_FILE_EXT = "h5"
        self._EIGD_FRAMERATE = 30

        # local storage location inside the package data directory
        self._data_dir = os.path.join(DATA_DIR, dataset_dir_name)
        os.makedirs(self._data_dir, exist_ok=True)

        # an empty directory means the data has not been fetched yet
        if not os.listdir(self._data_dir):
            self._download_and_extract()


[docs]
    def get(
        self, match_name: str = "48dcd3", segment: str = "00-06-00"
    ) -> Tuple[XY, XY, XY]:
        """Get one sample from the EIGD dataset.

        Parameters
        ----------
        match_name : str, optional
            Match name, check Notes section for valid arguments.
            Defaults to the first match ("48dcd3").
        segment : str, optional
            Segment identifier, check Notes section for valid arguments.
            Defaults to the first segment ("00-06-00").

        Returns
        -------
        sample: Tuple[XY, XY, XY]
            Returns three XY objects of the form (teamA, teamB, ball)
            for the requested sample.
        """
        file_name = os.path.join(
            self._data_dir, f"{match_name}_{segment}.{self._EIGD_FILE_EXT}"
        )

        if not os.path.isfile(file_name):
            raise FileNotFoundError(
                f"Could not load file, check class description for valid match "
                f"and segment values ({file_name})."
            )

        # extract from file
        with h5py.File(file_name) as h5f:
            pos_dict = {pos_set: positions[()] for pos_set, positions in h5f.items()}

        # assemble
        sample = (
            XY(xy=self._transform(pos_dict["team_a"]), framerate=self._EIGD_FRAMERATE),
            XY(xy=self._transform(pos_dict["team_b"]), framerate=self._EIGD_FRAMERATE),
            XY(xy=self._transform(pos_dict["balls"]), framerate=self._EIGD_FRAMERATE),
        )

        return sample



[docs]
    @staticmethod
    def get_pitch() -> Pitch:
        """Create the Pitch object that belongs to the EIGD data.

        Returns
        -------
        pitch: Pitch
            Pitch object built from the ``"eigd"`` template.
        """
        pitch = Pitch.from_template("eigd")
        return pitch


    @staticmethod
    def _transform(data: np.ndarray) -> np.ndarray:
        """Transforms spatiotemporal data from EIGD-format to floodlight format.

        Parameters
        ----------
        data: np.ndarray
            Array of shape (T, N, xydim), with T time dimension, N the number of players
            and xydim a separate dimension for x-, y-, and z-coordinates (ball only).

        Returns
        -------
        data_transformed: np.ndarray
            Array of shape (T, N*2), with T time dimension and N the number of players.
            All z-coordinates are omitted to match typical floodlight format.
        """
        # EIGD data is stored in 3-dimensional array, extract size and reshape
        T, N, _ = data.shape
        data_transformed = data[:, :, :2].reshape((T, N * 2))

        return data_transformed

    def _download_and_extract(self) -> None:
        """Downloads the dataset archive into temporary storage and extracts its
        content to the dataset directory.

        The archive is stored temporarily as ``eigd.zip`` inside ``DATA_DIR`` and is
        removed again afterwards, also if writing or extracting fails.
        """
        archive_path = os.path.join(DATA_DIR, "eigd.zip")

        # fetch before touching the file system so that a failed request does not
        # leave an empty archive behind
        payload = download_from_url(self._EIGD_HOST_URL)

        try:
            with open(archive_path, "wb") as binary_file:
                binary_file.write(payload)
            extract_zip(archive_path, self._data_dir)
        finally:
            # the archive is only an intermediate artifact, never keep it around
            if os.path.exists(archive_path):
                os.remove(archive_path)




[docs]
class ToyDataset:
    """This dataset loads synthetic data for a (very) short artificial football match.

    The data can be used for testing or trying out features. They come shipped with the
    package and are stored in the repository's root ``.data``-folder.

    Examples
    --------
    >>> from floodlight.io.datasets import ToyDataset

    >>> dataset = ToyDataset()
    # get one sample
    >>> (
    >>>     xy_home,
    >>>     xy_away,
    >>>     xy_ball,
    >>>     events_home,
    >>>     events_away,
    >>>     possession,
    >>>     ballstatus,
    >>> ) = dataset.get(segment="HT1")
    # get the corresponding pitch
    >>> pitch = dataset.get_pitch()

    """

    def __init__(self):
        # location of the files shipped with the package
        self._data_dir = os.path.join(DATA_DIR, "toy_dataset")

        # recording frequency passed to all XY and Code objects
        self._TOY_FRAMERATE = 5

        # playing direction of each team per half
        first_half = {"Home": "rl", "Away": "lr"}
        second_half = {"Home": "lr", "Away": "rl"}
        self._TOY_DIRECTIONS = {"HT1": first_half, "HT2": second_half}


[docs]
    def get(
        self, segment: str = "HT1"
    ) -> Tuple[XY, XY, XY, Events, Events, Code, Code]:
        """Get data objects for one segment from the toy dataset.

        Parameters
        ----------
        segment : {'HT1', 'HT2'}, optional
            Segment identifier for the first ("HT1", default)) or the second ("HT2")
            half.

        Returns
        -------
        toy_dataset:  Tuple[XY, XY, XY, Events, Events, Code, Code]
            Returns seven core objects of the form (xy_home, xy_away, xy_ball,
            events_home, events_away, possession, ballstatus) for the requested segment.
        """

        if segment not in ["HT1", "HT2"]:
            raise FileNotFoundError(
                f"Expected segment to be one of 'HT1' or 'HT2', got {segment}."
            )

        xy_home = XY(
            xy=np.load(os.path.join(self._data_dir, f"xy_home_{segment.lower()}.npy")),
            framerate=self._TOY_FRAMERATE,
            direction=self._TOY_DIRECTIONS[segment]["Home"],
        )

        xy_away = XY(
            xy=np.load(os.path.join(self._data_dir, f"xy_away_{segment.lower()}.npy")),
            framerate=self._TOY_FRAMERATE,
            direction=self._TOY_DIRECTIONS[segment]["Away"],
        )

        xy_ball = XY(
            xy=np.load(os.path.join(self._data_dir, f"xy_ball_{segment.lower()}.npy")),
            framerate=self._TOY_FRAMERATE,
        )

        events_home = Events(
            events=pd.read_csv(
                os.path.join(self._data_dir, f"events_home_{segment.lower()}.csv")
            )
        )

        events_away = Events(
            events=pd.read_csv(
                os.path.join(self._data_dir, f"events_away_{segment.lower()}.csv")
            )
        )

        possession = Code(
            code=np.load(
                os.path.join(self._data_dir, f"possession_{segment.lower()}.npy")
            ),
            name="possession",
            definitions={1: "Home", 2: "Away"},
            framerate=self._TOY_FRAMERATE,
        )

        ballstatus = Code(
            code=np.load(
                os.path.join(self._data_dir, f"ballstatus_{segment.lower()}.npy")
            ),
            name="ballstatus",
            definitions={0: "Dead", 1: "Alive"},
            framerate=self._TOY_FRAMERATE,
        )

        data_objects = (
            xy_home,
            xy_away,
            xy_ball,
            events_home,
            events_away,
            possession,
            ballstatus,
        )

        return data_objects



[docs]
    @staticmethod
    def get_pitch() -> Pitch:
        """Create the Pitch object that belongs to the Toy Dataset.

        Returns
        -------
        pitch: Pitch
            Football pitch of 105 x 68 m with flexible boundaries and coordinate
            limits of (-52.5, 52.5) in x- and (-34, 34) in y-direction.
        """
        x_limits = (-52.5, 52.5)
        y_limits = (-34, 34)
        pitch = Pitch(
            xlim=x_limits,
            ylim=y_limits,
            unit="m",
            boundaries="flexible",
            length=105,
            width=68,
            sport="football",
        )
        return pitch





[docs]
class StatsBombOpenDataset:
    """This dataset loads the StatsBomb open data provided by the `official
    data repository <https://github.com/statsbomb/open-data>`_.

    Due to the size of the full dataset (~5GB), only metadata (~2MB) are downloaded
    to the repository's root ``.data``-folder upon instantiation while the other data
    are only downloaded on demand. All downloaded files stay on disk if not manually
    removed.

    Parameters
    ----------
    dataset_dir_name: str, optional
        Name of subdirectory where the dataset is stored within the root .data
        directory. Defaults to 'statsbomb_dataset'.

    Notes
    -----
    The dataset contains results, lineups, event data, and (partly) `StatsBomb360 data
    <https://statsbomb.com/articles/soccer/
    statsbomb-360-freeze-frame-viewer-a-new-release-in-statsbomb-iq/>`_ for a variety
    of matches from a total of eight different competitions (Women's World Cup,
    FIFA World Cup, UEFA Euro, Champions League, FA Women's Super League, NWSL,
    Premier League, and La Liga).
    The Champions League data for example contains all Finals from 2003/2004 to
    2018/2019.
    The La Liga data contains every one of the 520 matches ever played by Lionel Messi
    for FC Barcelona.
    The UEFA Euro data contains 51 matches where StatsBomb360 data is available.
    As the data is constantly updated, we provide an overview over the stats here but
    refer to the official repository for up-to-date information (last
    checked 20.08.2022)::

        number_of_matches = {
            "Champions League": {
                '1999/2000' : 0, '2003/2004' : 1, '2004/2005' : 1, '2006/2007' : 1,
                '2008/2009' : 1, '2009/2010' : 1, '2010/2011' : 1, '2011/2012' : 1,
                '2012/2013' : 1, '2013/2014' : 1, '2014/2015' : 1, '2015/2016' : 1,
                '2016/2017' : 1, '2017/2018' : 1, '2018/2019' : 1,
                },
            "FA Women's Super League": {
                '2018/2019' : 108, '2019/2020' : 87, '2020/2021' : 131,
                },
            "FIFA World Cup": {
                '2018' : 64,
                },
            "La Liga": {
                '2004/2005': 7, '2005/2006' : 17, '2006/2007' : 26, '2007/2008' : 28,
                '2008/2009' : 31, '2009/2010' : 35, '2010/2011' : 33, '2011/2012' : 37,
                '2012/2013' : 32, '2013/2014' : 31, '2014/2015' : 38, '2015/2016' : 33,
                '2016/2017' : 34, '2017/2018' : 36, '2018/2019' : 34, '2019/2020' : 33,
                '2020/2021' : 35,
                },
            "NWSL": {
                '2018' : 36,
                },
            "Premier League": {
                '2003/2004' : 33,
                },
            "UEFA Euro" : {
                '2020' : 51,
                },
            "Women's World Cup": {
                '2019' : 52,
                },
        }

    Examples
    --------
    >>> from floodlight.io.datasets import StatsBombOpenDataset
    >>> dataset = StatsBombOpenDataset()
    # get one sample of event data with StatsBomb360 data
    >>> events, teamsheets = dataset.get("UEFA Euro", "2020", "England vs. Germany")
    # get the corresponding pitch
    >>> pitch = dataset.get_pitch()
    # get a summary of available matches in the dataset
    >>> matches = dataset.available_matches
    # extract every La Liga Clásico played in Camp Nou by Lionel Messi
    >>> clasicos = matches[matches["match_name"] == "Barcelona vs. Real Madrid"]
    # print outcomes
    >>> for _, match in clasicos.iterrows():
    >>>     print(f"Season {match['season_name']} - Barcelona {match['score']} Real")
    # read events to list
    >>> clasico_events = []
    >>> for _, clasico in clasicos.iterrows():
    >>>     data = dataset.get("La Liga", clasico["season_name"], clasico["match_name"])
    >>>     clasico_events.append(data)

    """

    def __init__(self, dataset_dir_name="statsbomb_dataset"):
        # lookup tables from names to StatsBomb identifiers
        self._links_competition_to_cID = {}
        self._links_season_to_sID = {}
        self._links_match_to_mID = {}

        # components of the remote repository layout
        self._STATSBOMB_SCHEMA = "https"
        self._STATSBOMB_BASE_URL = (
            "raw.githubusercontent.com/statsbomb/open-data/master/data"
        )
        self._STATSBOMB_COMPETITIONS_FILENAME = "competitions"
        self._STATSBOMB_MATCHES_FOLDERNAME = "matches"
        self._STATSBOMB_EVENTS_FOLDERNAME = "events"
        self._STATSBOMB_THREESIXTY_FOLDERNAME = "three-sixty"
        self._STATSBOMB_FILE_EXT = ".json"

        # root directory of the dataset; fetch the competition overview if missing
        self._data_dir = os.path.join(DATA_DIR, dataset_dir_name)
        competitions_file = (
            self._STATSBOMB_COMPETITIONS_FILENAME + self._STATSBOMB_FILE_EXT
        )
        self.filepath_competitions = os.path.join(self._data_dir, competitions_file)
        os.makedirs(self._data_dir, exist_ok=True)
        if not os.path.exists(self.filepath_competitions):
            self._download_competition_info()

        # match metadata of every season; missing files are fetched
        self._matches_data_dir = os.path.join(
            self._data_dir, self._STATSBOMB_MATCHES_FOLDERNAME
        )
        os.makedirs(self._matches_data_dir, exist_ok=True)
        self._download_matches_info()

        # location for event files
        self._events_data_dir = os.path.join(
            self._data_dir, self._STATSBOMB_EVENTS_FOLDERNAME
        )
        os.makedirs(self._events_data_dir, exist_ok=True)

        # location for StatsBomb360 files
        self._threesixty_data_dir = os.path.join(
            self._data_dir, self._STATSBOMB_THREESIXTY_FOLDERNAME
        )
        os.makedirs(self._threesixty_data_dir, exist_ok=True)

        # fill the competition lookup table from the competition overview
        self._read_competition_links_from_file()

    @property
    def available_matches(self) -> pd.DataFrame:
        """Creates and returns a DataFrame with information for all available matches
        from the metadata that is downloaded upon instantiation.

        Returns
        -------
        summary: pd.DataFrame
            Table where the rows contain meta information of individual games such as
            ``competition_name``, ``season_name``, and ``match_name`` (in the format
            Home vs. Away), location of the match (``stadium`` and ``country``),
            ``sex`` of the players (female or male), the ``StatsBomb360_status``  and
            the final ``score``.
        """
        summary = pd.DataFrame()

        # loop over season and competition
        for competition in self._links_competition_to_cID:
            cID = self._links_competition_to_cID[competition]
            self._read_season_match_links_for_competition_from_files(competition)
            for season in self._links_season_to_sID[competition]:
                sID = self._links_season_to_sID[competition][season]

                # loop over matches
                filepath_matches = os.path.join(
                    os.path.join(self._matches_data_dir, str(cID)),
                    str(sID) + self._STATSBOMB_FILE_EXT,
                )
                with open(filepath_matches, "r", encoding="utf8") as f:
                    matchinfo_list = json.load(f)

                for info in matchinfo_list:
                    match_info = {
                        "competition_name": competition,
                        "season_name": season,
                        "match_name": f"{info['home_team']['home_team_name']} "
                        f"vs. "
                        f"{info['away_team']['away_team_name']}",
                        "score": f"{info['home_score']}:{info['away_score']}",
                        "stadium": (
                            info["stadium"]["name"] if "stadium" in info else None
                        ),
                        "country": (
                            info["stadium"]["country"]["name"]
                            if "stadium" in info
                            else None
                        ),
                        "sex": (
                            "f"
                            if competition
                            in ["FA Women's Super League", "NWSL", "Women's World Cup"]
                            else "m"
                        ),
                        "StatsBomb360_status": info["match_status_360"],
                        "cID": cID,
                        "sID": sID,
                        "mID": info["match_id"],
                    }
                    summary = pd.concat(
                        [summary, pd.DataFrame([match_info])], ignore_index=True
                    )

        return summary


[docs]
    def get(
        self,
        competition_name: str = "La Liga",
        season_name: str = "2020/2021",
        match_name: str = None,
        teamsheet_home: Teamsheet = None,
        teamsheet_away: Teamsheet = None,
    ) -> Tuple[Dict[str, Dict[str, Events]], Dict[str, Teamsheet]]:
        """Get events and teamsheets from one match of the StatsBomb open dataset.

        If `StatsBomb360 data <https://statsbomb.com/articles/soccer/
        statsbomb-360-freeze-frame-viewer-a-new-release-in-statsbomb-iq/>`_  are
        available, they are stored in the  ``qualifier`` column of the Events object.
        If the files are not contained in the repository's root ``.data`` folder they
        are downloaded to the folder and will be stored until removed by hand.

        Parameters
        ----------
        competition_name : str, optional
            Competition name for which the match is played, check Notes section for
            possible competitions. Defaults to "La Liga".
        season_name : str, optional
            Season name during which the match is played. For league matches use the
            format YYYY/YYYY and for international cup matches the format YYYY.
            Check Notes for available seasons of every competition.
            Defaults to "2020/2021".
        match_name: str, optional
            Match name relating to the available matches in the chosen competition and
            season. If equal to None (default), the first available match of the
            given competition and season is chosen.
        teamsheet_home: Teamsheet, optional
            Teamsheet-object for the home team used to create link dictionaries of the
            form `links[pID] = team`. If given as None (default), teamsheet is extracted
            from the data.
        teamsheet_away: Teamsheet, optional
            Teamsheet-object for the away team. If given as None (default), teamsheet is
            extracted from data.

        Returns
        -------
        data_objects: Tuple[Dict[str, Dict[str, Events]], Dict[str, Teamsheet]]
            Tuple of (nested) floodlight core objects with shape (events_objects,
            teamsheets).

            ``events_objects`` is a nested dictionary containing ``Events`` objects for
            each team and segment of the form
            ``events_objects[segment][team] = Events``.
            For a typical league match with two halves and teams this dictionary looks
            like:
            ``{'HT1': {'Home': Events, 'Away': Events}, 'HT2': {'Home': Events, 'Away':
            Events}}``.

            ``teamsheets`` is a dictionary containing ``Teamsheet`` objects for each
            team of the form ``teamsheets[team] = Teamsheet``.

        Raises
        ------
        KeyError
            If the competition, season or match name is not contained in the link
            dictionaries.
        """
        # get identifiers from links
        cID = self._links_competition_to_cID[competition_name]
        if competition_name not in self._links_season_to_sID:
            self._read_season_match_links_for_competition_from_files(competition_name)
        sID = self._links_season_to_sID[competition_name][season_name]
        matches_dict = self._links_match_to_mID[competition_name][season_name]
        if match_name is None:
            mID = list(matches_dict.values())[0]
        else:
            mID = matches_dict[match_name]

        # create paths
        filepath_matches = os.path.join(
            os.path.join(self._matches_data_dir, str(cID)),
            str(sID) + self._STATSBOMB_FILE_EXT,
        )
        filepath_events = os.path.join(
            self._events_data_dir,
            str(mID) + self._STATSBOMB_FILE_EXT,
        )
        filepath_threesixty = os.path.join(
            self._threesixty_data_dir,
            str(mID) + self._STATSBOMB_FILE_EXT,
        )

        # check if events need to be downloaded
        if not os.path.exists(filepath_events):
            events_host_url = (
                f"{self._STATSBOMB_SCHEMA}://"
                f"{self._STATSBOMB_BASE_URL}/"
                f"{self._STATSBOMB_EVENTS_FOLDERNAME}/"
                f"{str(mID)}"
                f"{self._STATSBOMB_FILE_EXT}"
            )
            # download before opening the target file: opening first would leave an
            # empty file behind on a failed request, which later calls would mistake
            # for already downloaded data
            events_data = download_from_url(events_host_url)
            with open(filepath_events, "wb") as binary_file:
                binary_file.write(events_data)

        # check if StatsBomb360 data is available and needs to be downloaded
        if not os.path.exists(filepath_threesixty):
            threesixty_host_url = (
                f"{self._STATSBOMB_SCHEMA}://"
                f"{self._STATSBOMB_BASE_URL}/"
                f"{self._STATSBOMB_THREESIXTY_FOLDERNAME}/"
                f"{str(mID)}"
                f"{self._STATSBOMB_FILE_EXT}"
            )
            try:
                data = download_from_url(threesixty_host_url)
                with open(filepath_threesixty, "wb") as binary_file:
                    binary_file.write(data)
            except HTTPError:
                # an HTTP error is treated as "no StatsBomb360 data for this match"
                filepath_threesixty = None

        # read events from file
        events_objects, teamsheets = read_open_event_data_json(
            filepath_events,
            filepath_matches,
            filepath_threesixty,
            teamsheet_home,
            teamsheet_away,
        )

        # assembly
        data_objects = (events_objects, teamsheets)

        return data_objects



[docs]
    def get_teamsheets(
        self,
        competition_name: str = "La Liga",
        season_name: str = "2020/2021",
        match_name: str = None,
    ) -> Dict[str, Teamsheet]:
        """Returns a dictionary with Teamsheet-objects for both teams ("Home" and
        "Away") from one match of the StatsBomb open dataset.

        If the event file of the match is not contained in the events directory of
        the dataset, it is downloaded and stored there.

        Parameters
        ----------
        competition_name : str, optional
            Competition name for which the match is played, check Notes section for
            possible competitions. Defaults to "La Liga".
        season_name : str, optional
            Season name during which the match is played. For league matches use the
            format YYYY/YYYY and for international cup matches the format YYYY.
            Check Notes for available seasons of every competition.
            Defaults to "2020/2021".
        match_name: str, optional
            Match name relating to the available matches in the chosen competition and
            season. If equal to None (default), the first available match of the
            given competition and season is chosen.

        Returns
        -------
        teamsheets: Dict[str, Teamsheet]
            Teamsheet-objects for both teams ("Home" and "Away") of the given match.

        Raises
        ------
        KeyError
            If the competition, season or match name is not contained in the link
            dictionaries.
        """

        # get identifiers from links
        cID = self._links_competition_to_cID[competition_name]
        if competition_name not in self._links_season_to_sID:
            self._read_season_match_links_for_competition_from_files(competition_name)
        sID = self._links_season_to_sID[competition_name][season_name]
        matches_dict = self._links_match_to_mID[competition_name][season_name]
        if match_name is None:
            mID = list(matches_dict.values())[0]
        else:
            mID = matches_dict[match_name]

        # create paths
        filepath_matches = os.path.join(
            os.path.join(self._matches_data_dir, str(cID)),
            str(sID) + self._STATSBOMB_FILE_EXT,
        )
        filepath_events = os.path.join(
            self._events_data_dir,
            str(mID) + self._STATSBOMB_FILE_EXT,
        )

        # check if events need to be downloaded
        if not os.path.exists(filepath_events):
            events_host_url = (
                f"{self._STATSBOMB_SCHEMA}://"
                f"{self._STATSBOMB_BASE_URL}/"
                f"{self._STATSBOMB_EVENTS_FOLDERNAME}/"
                f"{str(mID)}"
                f"{self._STATSBOMB_FILE_EXT}"
            )
            # download before opening the target file: opening first would leave an
            # empty file behind on a failed request, which later calls would mistake
            # for already downloaded data
            events_data = download_from_url(events_host_url)
            with open(filepath_events, "wb") as binary_file:
                binary_file.write(events_data)

        # read teamsheets from file
        teamsheets = read_teamsheets_from_open_event_data_json(
            filepath_events,
            filepath_matches,
        )

        return teamsheets



[docs]
    @staticmethod
    def get_pitch() -> Pitch:
        """Create the Pitch-object that belongs to the StatsBomb Dataset.

        Returns
        -------
        pitch: Pitch
            Pitch object built from the ``"statsbomb"`` template for the sport
            ``"football"``.
        """
        pitch = Pitch.from_template("statsbomb", sport="football")
        return pitch


    def _read_competition_links_from_file(self):
        """Writes the data links between the available competitions and the respective
        cIDs to the class level dictionary.
        """
        # updates on competition level
        competition_info = pd.read_json(self.filepath_competitions)
        cIDs = competition_info["competition_id"].unique()
        competitions = competition_info["competition_name"].unique()
        self._links_competition_to_cID.update(
            {competition: cIDs[i] for i, competition in enumerate(competitions)}
        )

    def _read_season_match_links_for_competition_from_files(self, competition_name):
        """Writes data links between the seasons and matches to the respective sIDs
        and mIDs for a given competition to the class level dictionaries.
        """
        # read competition file
        cID = self._links_competition_to_cID[competition_name]
        competition_info = pd.read_json(self.filepath_competitions)

        # update season and match dictionaries with competition information
        self._links_season_to_sID.update({competition_name: {}})
        self._links_match_to_mID.update({competition_name: {}})

        # loop over all available seasons of the given competition
        for _, single_season in competition_info.iterrows():
            if cID != single_season["competition_id"]:
                continue

            # update season and match dictionaries with season information
            sID = single_season["season_id"]
            season_name = single_season["season_name"]
            self._links_season_to_sID[competition_name].update({season_name: sID})
            self._links_match_to_mID[competition_name].update({season_name: {}})

            # read information of all matches within the season
            filepath_matches = os.path.join(
                os.path.join(self._matches_data_dir, str(cID)),
                str(sID) + self._STATSBOMB_FILE_EXT,
            )
            with open(filepath_matches, "rb") as matches_file:
                season_matches_info = json.load(matches_file)

            # update match dictionary with match information
            for info in season_matches_info:
                match_name = (
                    f"{info['home_team']['home_team_name']} vs. "
                    f"{info['away_team']['away_team_name']}"
                )
                mID = info["match_id"]
                self._links_match_to_mID[competition_name][season_name].update(
                    {match_name: mID}
                )

    def _download_competition_info(self) -> None:
        """Downloads json file with competition information into the file system.

        Raises
        ------
        URLError
            If the competition file could not be downloaded. The original error is
            attached as the cause.
        """
        competitions_host_url = (
            f"{self._STATSBOMB_SCHEMA}://"
            f"{self._STATSBOMB_BASE_URL}/"
            f"{self._STATSBOMB_COMPETITIONS_FILENAME}"
            f"{self._STATSBOMB_FILE_EXT}"
        )

        # download file with information of all seasons; fetch the content before
        # opening the target so that no failure during the download can leave an empty
        # json file behind that would be mistaken for valid data on the next run
        try:
            competitions_data = download_from_url(competitions_host_url)
        except URLError as err:
            raise URLError(
                f"Could not download competitions.json from URL "
                f"{competitions_host_url}. Check your internet connection!"
            ) from err

        with open(self.filepath_competitions, "wb") as binary_file:
            binary_file.write(competitions_data)

    def _download_matches_info(self) -> None:
        """Downloads the json files containing information about available matches from
        all available seasons into the file system.
        """
        competition_info = pd.read_json(self.filepath_competitions)

        for _, single_season in competition_info.iterrows():
            cID = single_season["competition_id"]
            sID = single_season["season_id"]
            matches_filepath = os.path.join(
                os.path.join(self._matches_data_dir, str(cID)),
                str(sID) + self._STATSBOMB_FILE_EXT,
            )
            if not os.path.exists(matches_filepath):
                season_host_url = (
                    f"{self._STATSBOMB_SCHEMA}://"
                    f"{self._STATSBOMB_BASE_URL}/"
                    f"{self._STATSBOMB_MATCHES_FOLDERNAME}/"
                    f"{str(cID)}/"
                    f"{str(sID)}"
                    f"{self._STATSBOMB_FILE_EXT}"
                )
                competition_data_dir = os.path.join(self._matches_data_dir, str(cID))
                if not os.path.isdir(competition_data_dir):
                    os.makedirs(competition_data_dir, exist_ok=True)
                season_file = os.path.join(
                    competition_data_dir, str(sID) + self._STATSBOMB_FILE_EXT
                )
                with open(season_file, "wb") as binary_file:
                    binary_file.write(download_from_url(season_host_url))




[docs]
class IDSSEDataset:
    """This dataset loads the accompanying data set from the *An integrated dataset of
    spatiotemporal and event data in elite soccer* paper. [2]_

    Upon instantiation, the class checks if the specified data already exists in the
    repository's root ``.data``-folder, and will download the files to this folder if
    not. The default setting is to load the first match from the dataset. However, any
    individual match or the entire dataset (~2.4 GB) can be downloaded.

    Parameters
    ----------
    dataset_dir_name: str, optional
        Name of subdirectory where the dataset is stored within the root .data
        directory. Defaults to 'idsse_dataset'.
    match_id: str, optional
        Match-ID of either one of the matches or 'all'. Defaults to 'J03WMX'. Setting it
        to one of the matches will download the data of this individual match, if it
        does not exist in the repository's root ``.data``-folder. Setting it to 'all'
        will download the data of all matches that do not exist in ``.data``.

    Raises
    ------
    ValueError
        If ``match_id`` is neither one of the available Match-IDs nor 'all'.

    Notes
    -----
    The dataset contains seven full matches of raw event and position data for both
    teams and the ball from the German Men's Bundesliga season 2022/23 first and second
    division. A detailed description of the dataset as well as the collection process
    can be found in the accompanying paper. Data for one match can be queried calling
    the :func:`~IDSSEDataset.get`-method by specifying the match. The following matches
    are available::

        matches = {
        'J03WMX': 1. FC Köln vs. FC Bayern München,
        'J03WN1': VfL Bochum 1848 vs. Bayer 04 Leverkusen,
        'J03WPY': Fortuna Düsseldorf vs. 1. FC Nürnberg,
        'J03WOH': Fortuna Düsseldorf vs. SSV Jahn Regensburg,
        'J03WQQ': Fortuna Düsseldorf vs. FC St. Pauli,
        'J03WOY': Fortuna Düsseldorf vs. F.C. Hansa Rostock,
        'J03WR9': Fortuna Düsseldorf vs. 1. FC Kaiserslautern
        }

    Examples
    --------
    >>> from floodlight.io.datasets import IDSSEDataset

    >>> dataset = IDSSEDataset(match_id="J03WMX")
    # get one sample
    >>> events, xy, possession, ballstatus, teamsheets, pitch = dataset.get("J03WMX")
    # get the corresponding pitch
    >>> pitch = dataset.get_pitch()


    References
    ----------
        .. [2] `Bassek, M., Weber, H., Rein, R., & Memmert, D. (2024). An integrated
            dataset of spatiotemporal and event data in elite soccer. Scientific Data,
            12(195). <https://doi.org/10.1038/s41597-025-04505-y>`_
    """

    def __init__(
        self, dataset_dir_name: str = "idsse_dataset", match_id: str = "J03WMX"
    ) -> None:
        self._IDSSE_SCHEMA = "https"
        self._IDSSE_BASE_URL = "ndownloader.figshare.com/files"
        # remote file IDs of the three XML files of every match, keyed by Match-ID
        self._IDSSE_FILE_IDS_INFO = {
            "J03WMX": "51643475",
            "J03WN1": "51643472",
            "J03WPY": "51643487",
            "J03WOH": "51643478",
            "J03WQQ": "51643484",
            "J03WOY": "51643481",
            "J03WR9": "51643490",
        }
        self._IDSSE_FILE_IDS_EVENT = {
            "J03WMX": "51643493",
            "J03WN1": "51643496",
            "J03WPY": "51643505",
            "J03WOH": "51643499",
            "J03WQQ": "51643508",
            "J03WOY": "51643502",
            "J03WR9": "51643511",
        }
        self._IDSSE_FILE_IDS_POSITION = {
            "J03WMX": "51643514",
            "J03WN1": "51643517",
            "J03WPY": "51643526",
            "J03WOH": "51643520",
            "J03WQQ": "51643529",
            "J03WOY": "51643523",
            "J03WR9": "51643532",
        }
        self._IDSSE_FILE_EXT = "xml"

        # resolve the request to a list of Match-IDs before touching the file system,
        # so an invalid match_id does not create the dataset directory
        if match_id in self._IDSSE_FILE_IDS_INFO:
            requested_matches = [match_id]
        elif match_id == "all":
            requested_matches = list(self._IDSSE_FILE_IDS_INFO)
        else:
            # list the Match-IDs (dict keys); the dict values are remote file IDs
            raise ValueError(
                f"Expected match_id to be in {list(self._IDSSE_FILE_IDS_INFO)} or "
                f"`all`, got {match_id} instead."
            )

        self._data_dir = os.path.join(DATA_DIR, dataset_dir_name)
        os.makedirs(self._data_dir, exist_ok=True)

        # download every file of the requested match(es) that is not on disk yet; for
        # 'all', the URL and file-name attributes end up describing the last match
        for requested_match in requested_matches:
            self._set_match_files(requested_match)
            downloads = (
                (self._IDSSE_FILE_NAME_INFO, self._IDSSE_HOST_URL_INFO),
                (self._IDSSE_FILE_NAME_EVENT, self._IDSSE_HOST_URL_EVENT),
                (self._IDSSE_FILE_NAME_POSITION, self._IDSSE_HOST_URL_POSITION),
            )
            for file_name, host_url in downloads:
                if not os.path.isfile(os.path.join(self._data_dir, file_name)):
                    self._download_and_write(file_name, host_url)

    @staticmethod
    def _competition_id(match_id: str) -> str:
        """Returns the competition ID that is part of the file names of a match."""
        # the matches 'J03WMX' and 'J03WN1' are filed under the first competition ID,
        # every other match under the second one
        if match_id in ("J03WMX", "J03WN1"):
            return "DFL-COM-000001"
        return "DFL-COM-000002"

    def _file_names(self, match_id: str) -> Tuple[str, str, str]:
        """Returns the names (without directory) of the match information, event, and
        position file of a match, in this order.
        """
        suffix = (
            f"{self._competition_id(match_id)}_DFL-MAT-{match_id}."
            f"{self._IDSSE_FILE_EXT}"
        )
        return (
            f"DFL_02_01_matchinformation_{suffix}",
            f"DFL_03_02_events_raw_{suffix}",
            f"DFL_04_03_positions_raw_observed_{suffix}",
        )

    def _set_match_files(self, match_id: str) -> None:
        """Sets the host URL and file name attributes to the three files of a match.

        ``match_id`` has to be a key of the file ID dictionaries.
        """
        url_prefix = f"{self._IDSSE_SCHEMA}://{self._IDSSE_BASE_URL}/"
        self._IDSSE_HOST_URL_INFO = f"{url_prefix}{self._IDSSE_FILE_IDS_INFO[match_id]}"
        self._IDSSE_HOST_URL_EVENT = (
            f"{url_prefix}{self._IDSSE_FILE_IDS_EVENT[match_id]}"
        )
        self._IDSSE_HOST_URL_POSITION = (
            f"{url_prefix}{self._IDSSE_FILE_IDS_POSITION[match_id]}"
        )
        (
            self._IDSSE_FILE_NAME_INFO,
            self._IDSSE_FILE_NAME_EVENT,
            self._IDSSE_FILE_NAME_POSITION,
        ) = self._file_names(match_id)

    def _download_and_write(self, file_name, host_url) -> None:
        """Downloads a file into temporary storage and writes the content to the file
        system.

        Parameters
        ----------
        file_name: str
            Name of the file to be written within the dataset directory.
        host_url: str
            URL the content is downloaded from.
        """
        target_path = os.path.join(self._data_dir, file_name)
        # download before opening the target, so a failed download does not create an
        # empty file that would later be mistaken for an existing download
        response = download_from_url(host_url)
        # the with-statement closes the file, no explicit close() required
        with open(target_path, "wb") as binary_file:
            binary_file.write(response)

    @staticmethod
    def get_pitch() -> Pitch:
        """Returns a Pitch object corresponding to the IDSSE-data."""
        return Pitch.from_template("dfl", length=105, width=68)

    def get(
        self,
        match_id: str = "J03WMX",
        teamsheet_home: Teamsheet = None,
        teamsheet_away: Teamsheet = None,
        events=True,
        positions=True,
    ) -> Tuple[
        Dict[str, Dict[str, Events]],
        Dict[str, Dict[str, XY]],
        Dict[str, Code],
        Dict[str, Code],
        Dict[str, Teamsheet],
        Pitch,
    ]:
        """Get event and position data from the IDSSE dataset.

        Parameters
        ----------
        match_id : str, optional
            Match name, check Notes section of the class for valid arguments.
            Defaults to the first match "J03WMX".
        teamsheet_home: Teamsheet, optional
            Teamsheet-object for the home team used to create link dictionaries of the
            form `links[pID] = team`. If given as None (default), teamsheet is
            extracted from the data.
        teamsheet_away: Teamsheet, optional
            Teamsheet-object for the away team used to create link dictionaries of the
            form `links[pID] = team`. If given as None (default), teamsheet is
            extracted from the data.
        events: bool, optional
            Specifies whether the event data should be returned. Default is True. If
            False, None will be returned instead of the events-objects.
        positions: bool, optional
            Specifies whether the position data should be returned. Default is True. If
            False, None will be returned instead of the XY-objects, possession-objects,
            and ballstatus-objects. This will improve performance considerably if only
            event data is required.

        Returns
        -------
        match_data: Tuple[Dict[str, Dict[str, Events]], Dict[str, Dict[str, XY]],
        Dict[str, Code], Dict[str, Code], Dict[str, Teamsheet], Pitch]
            Returns a tuple of shape (events_objects, xy_objects, possession_objects,
            ballstatus_objects, teamsheets_objects, pitch_object) as returned by the
            ``floodlight.io.dfl.read_event_data_xml()`` and
            ``floodlight.io.dfl.read_position_data_xml()`` functions for the requested
            match. If any of the arguments ``events`` or ``positions`` are set to False,
            None is returned instead of `events_objects` or `xy_objects`,
            `possession_objects`, and `ballstatus_objects`, respectively. If both are
            set to False, no file is parsed and `teamsheets_objects` and `pitch_object`
            are None as well.

        Raises
        ------
        FileNotFoundError
            If the match information file, or the event or position file that has been
            requested, does not exist in the dataset directory. This is the case for
            unknown Match-IDs and for matches that have not been downloaded.
        """
        info_name, event_name, position_name = self._file_names(match_id)
        file_name_infos = os.path.join(self._data_dir, info_name)
        file_name_events = os.path.join(self._data_dir, event_name)
        file_name_positions = os.path.join(self._data_dir, position_name)

        # fail early and uniformly for every file that is going to be read
        required_files = [file_name_infos]
        if events:
            required_files.append(file_name_events)
        if positions:
            required_files.append(file_name_positions)
        for required_file in required_files:
            if not os.path.isfile(required_file):
                raise FileNotFoundError(
                    f"Could not load file, check class description for valid match "
                    f"values and make sure the match has been downloaded "
                    f"({required_file})."
                )

        # parse event data
        if events:
            events_objects, teamsheets_objects, pitch_object = read_event_data_xml(
                file_name_events, file_name_infos, teamsheet_home, teamsheet_away
            )
        else:
            events_objects, teamsheets_objects, pitch_object = (None, None, None)

        # parse position data (teamsheets and pitch of this parse take precedence)
        if positions:
            (
                xy_objects,
                possession_objects,
                ballstatus_objects,
                teamsheets_objects,
                pitch_object,
            ) = read_position_data_xml(
                file_name_positions, file_name_infos, teamsheet_home, teamsheet_away
            )
        else:
            xy_objects, possession_objects, ballstatus_objects = (None, None, None)

        # assemble
        match_data = (
            events_objects,
            xy_objects,
            possession_objects,
            ballstatus_objects,
            teamsheets_objects,
            pitch_object,
        )

        return match_data