Source code for floodlight.io.statsperform

import os.path
import warnings
from typing import Dict, Tuple, Union
from pathlib import Path

import numpy as np
import pandas as pd
from lxml import etree

from floodlight.io.utils import download_from_url, get_and_convert
from floodlight.core.code import Code
from floodlight.core.events import Events
from floodlight.core.pitch import Pitch
from floodlight.core.teamsheet import Teamsheet
from floodlight.core.xy import XY
from floodlight.settings import DATA_DIR


# ----------------------------- StatsPerform Open Format -------------------------------


def _create_metadata_from_open_csv_df(
    csv_df: pd.DataFrame,
) -> Tuple[Dict[int, tuple], Pitch]:
    """Creates meta information from a pd.DataFrame that results from parsing the open
    StatsPerform CSV file.

    Parameters
    ----------
    csv_df: pd.DataFrame
        Data Frame with the parsed event data CSV file.

    Returns
    -------
    periods: Dict[int, tuple]
        Dictionary with start and endframes:
            ``periods[segment] = (startframe, endframe)``.
    pitch: Pitch
        Playing Pitch object.
    """

    # create pitch
    pi_len = csv_df["pitch_dimension_long_side"].values[0]
    pi_wid = csv_df["pitch_dimension_short_side"].values[0]
    pitch = Pitch.from_template(
        "statsperform_open",
        length=pi_len,
        width=pi_wid,
        sport="football",
    )

    # create periods for segments, coded as jumps in the frame sequence
    periods = {}
    frame_values = csv_df["frame_count"].unique()

    seg_idx = np.where(np.diff(frame_values, prepend=frame_values[0]) > 1)
    seg_idx = np.insert(seg_idx, 0, 0)
    seg_idx = np.append(seg_idx, len(frame_values))
    for segment in range(len(seg_idx) - 1):
        start = int(frame_values[seg_idx[segment]])
        end = int(frame_values[seg_idx[segment + 1] - 1])
        periods[segment] = (start, end)

    return periods, pitch
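

# Illustrative sketch added for documentation purposes; not part of the original
# module. The period detection above keys off jumps in the frame counter: np.diff with
# ``prepend`` yields 0 at the first frame and a value > 1 wherever the counter jumps
# into a new segment. The toy frame values below are made up for demonstration.
def _demo_segment_detection_from_frame_jumps():
    frame_values = np.array([10, 11, 12, 13, 500, 501, 502])  # jump between 13 and 500
    seg_idx = np.where(np.diff(frame_values, prepend=frame_values[0]) > 1)
    seg_idx = np.insert(seg_idx, 0, 0)
    seg_idx = np.append(seg_idx, len(frame_values))
    periods = {
        seg: (int(frame_values[seg_idx[seg]]), int(frame_values[seg_idx[seg + 1] - 1]))
        for seg in range(len(seg_idx) - 1)
    }
    return periods  # {0: (10, 13), 1: (500, 502)}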


def _read_open_event_csv_single_line(
    line: str,
) -> Tuple[Dict, str, str]:
    """Extracts all relevant information from a single line of StatsPerform's Event csv
    file (i.e. one single event in the data).

    Parameters
    ----------
    line: str
        One full line from StatsPerform's Event CSV file.

    Returns
    -------
    event: Dict
        Dictionary with relevant event information in the form:
        ``event[attribute] = value``.
    team: str
        The team ID parsed from the event line.
    segment: str
        The segment identifier parsed from the event line.
    """
    event = {}
    attrib = line.split(sep=",")

    # description
    event["eID"] = attrib[5].replace(" ", "")

    # relative time
    event["gameclock"] = float(attrib[4])
    event["frameclock"] = float(attrib[2])

    # segment, player and team
    segment = attrib[3]
    team = attrib[9]
    event["tID"] = team
    event["pID"] = attrib[8]

    # outcome
    event["outcome"] = np.nan
    if "Won" in attrib[5].split(" "):
        event["outcome"] = 1
    elif "Lost" in attrib[5].split(" "):
        event["outcome"] = 0

    # minute and second of game
    event["minute"] = np.floor(event["gameclock"] / 60)
    event["second"] = np.floor(event["gameclock"] - event["minute"] * 60)

    # additional information (qualifier)
    event["qualifier"] = {
        "event_id": attrib[1],
        "event_type_id": attrib[6],
        "sequencenumber": attrib[7],
        "jersey_no": attrib[10],
        "is_pass": attrib[11],
        "is_cross": attrib[12],
        "is_corner": attrib[13],
        "is_free_kick": attrib[14],
        "is_goal_kick": attrib[15],
        "passtypeid": attrib[16],
        "wintypeid": attrib[17],
        "savetypeid": attrib[18],
        "possessionnumber": attrib[19],
    }

    return event, team, segment


def read_teamsheets_from_open_data_csv(
    filepath_csv: Union[str, Path]
) -> Dict[str, Teamsheet]:
    """Parses the entire open StatsPerform position data CSV file for unique jIDs
    (jerseynumbers) and creates teamsheets for both teams.

    Parameters
    ----------
    filepath_csv: str or pathlib.Path
        CSV file containing either open position or open event data.

    Returns
    -------
    teamsheets: Dict[str, Teamsheet]
        Dictionary with teamsheets for the home team and the away team.

    Notes
    -----
    StatsPerform open data does not contain any player names. Thus, the teamsheet
    objects generated by this method will name players 'Player i' with i starting at 1.
    To identify players, use the jersey numbers of players or provide custom teamsheets
    generated by a different parser if StatsPerform open data is used in combination
    with other data providers.
    """
    # read CSV file into pd.DataFrame
    csv_df = pd.read_csv(str(filepath_csv))

    # initialize team and ball ids
    team_ids = {"Home": 1.0, "Away": 2.0}
    ball_id = 4

    # check for additional tIDs
    for tID in csv_df["team_id"].unique():
        if not (tID in team_ids.values() or tID == ball_id or np.isnan(tID)):
            warnings.warn(
                f"tID {tID} did not match any of the standard tIDs "
                f"({team_ids.values()}) or the ball ID ({ball_id})!"
            )

    # initialize teamsheets
    teamsheets = {
        "Home": pd.DataFrame(columns=["player", "jID", "pID", "tID"]),
        "Away": pd.DataFrame(columns=["player", "jID", "pID", "tID"]),
    }

    # loop over teams
    for team in team_ids:
        # extract list with pID and jID information for all players in the team
        team_id = team_ids[team]
        team_df = csv_df[csv_df["team_id"] == team_id]
        jIDs = team_df["jersey_no"].unique()
        pIDs = [
            team_df[team_df["jersey_no"] == jID]["player_id"].unique() for jID in jIDs
        ]
        # possible check for multiple pIDs assigned to a single jID

        # insert data to teamsheet
        teamsheets[team]["player"] = [f"Player {i}" for i in range(len(pIDs))]
        teamsheets[team]["jID"] = [jID for jID in jIDs]
        teamsheets[team]["pID"] = [pID[0] for pID in pIDs]
        teamsheets[team]["tID"] = team_id

    # create teamsheet objects
    for team in teamsheets:
        teamsheets[team] = Teamsheet(teamsheets[team])

    return teamsheets
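

# Illustrative usage sketch added for documentation purposes; not part of the original
# module. The file path is a placeholder for any open StatsPerform data CSV file.
def _example_open_teamsheets():
    # "open_match_data.csv" is a placeholder path
    teamsheets = read_teamsheets_from_open_data_csv("open_match_data.csv")
    home = teamsheets["Home"]
    # each Teamsheet wraps a DataFrame with the columns player, jID, pID and tID
    print(home.teamsheet.head())
    # assign xIDs in order of appearance and build the jID -> xID mapping that the
    # position data parser below relies on
    home.add_xIDs()
    links = home.get_links("jID", "xID")
    return links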


def read_open_event_data_csv(
    filepath_events: Union[str, Path],
    teamsheet_home: Teamsheet = None,
    teamsheet_away: Teamsheet = None,
) -> Tuple[Dict[str, Dict[str, Events]], Dict[str, Teamsheet]]:
    """Parses an open StatsPerform Match Event CSV file and extracts the event data
    and teamsheets.

    This function provides high-level access to the particular openly published
    StatsPerform match events CSV file (e.g. for the Pro Forum '22) and returns Events
    objects for both teams.

    Parameters
    ----------
    filepath_events: str or pathlib.Path
        Full path to the CSV file where the StatsPerform event data is saved.
    teamsheet_home: Teamsheet, optional
        Teamsheet-object for the home team. If given as None (default), teamsheet is
        extracted from the event data CSV file.
    teamsheet_away: Teamsheet, optional
        Teamsheet-object for the away team. If given as None (default), teamsheet is
        extracted from the event data CSV file.

    Returns
    -------
    data_objects: Tuple[Dict[str, Dict[str, Events]], Dict[str, Teamsheet]]
        Tuple of (nested) floodlight core objects with shape (events_objects,
        teamsheets).

        ``events_objects`` is a nested dictionary containing ``Events`` objects for
        each team and segment of the form ``events_objects[segment][team] = Events``.
        For a typical league match with two halves and teams, this dictionary looks
        like: ``{'1': {'Home': Events, 'Away': Events}, '2': {'Home': Events, 'Away':
        Events}}``.

        ``teamsheets`` is a dictionary containing ``Teamsheet`` objects for each team
        of the form ``teamsheets[team] = Teamsheet``.

    Notes
    -----
    StatsPerform's open event data format provides certain additional event attributes
    that attach further information to certain events. As of now, this information is
    parsed as a string in the ``qualifier`` column of the returned DataFrame and can be
    transformed to a dict of the form ``{attribute: value}``.
    """
    # initialize bin and variables
    events = {}
    team_ids = {"Home": 1.0, "Away": 2.0}
    segments = ["1", "2"]
    for team in team_ids.values():
        events[team] = {segment: pd.DataFrame() for segment in segments}

    # create or check teamsheet objects
    if teamsheet_home is None and teamsheet_away is None:
        teamsheets = read_teamsheets_from_open_data_csv(filepath_events)
        teamsheet_home = teamsheets["Home"]
        teamsheet_away = teamsheets["Away"]
    elif teamsheet_home is None:
        teamsheets = read_teamsheets_from_open_data_csv(filepath_events)
        teamsheet_home = teamsheets["Home"]
    elif teamsheet_away is None:
        teamsheets = read_teamsheets_from_open_data_csv(filepath_events)
        teamsheet_away = teamsheets["Away"]
    else:
        pass
        # potential check

    # parse event data
    with open(str(filepath_events), "r") as f:
        while True:
            line = f.readline()

            # terminate if at end of file
            if len(line) == 0:
                break

            # skip the head
            if line.split(sep=",")[3] == "current_phase":
                continue

            # read single line
            event, team, segment = _read_open_event_csv_single_line(line)

            # insert to bin
            if team:
                team = float(team)
                events[team][segment] = events[team][segment].append(
                    event, ignore_index=True
                )
            else:  # if no clear assignment possible, insert to bins for both teams
                for team in team_ids.values():
                    events[team][segment] = events[team][segment].append(
                        event, ignore_index=True
                    )

    # create objects
    events_objects = {}
    for segment in segments:
        events_objects[segment] = {}
        for team in ["Home", "Away"]:
            events_objects[segment][team] = Events(
                events=pd.DataFrame(data=events[team_ids[team]][segment]),
            )
    teamsheets = {
        "Home": teamsheet_home,
        "Away": teamsheet_away,
    }

    # pack objects
    data_objects = (events_objects, teamsheets)

    return data_objects


def read_open_position_data_csv(
    filepath_position: Union[str, Path],
    teamsheet_home: Teamsheet = None,
    teamsheet_away: Teamsheet = None,
) -> Tuple[Dict[int, Dict[str, XY]], Dict[int, Code], Dict[str, Teamsheet], Pitch]:
    """Parses an open StatsPerform CSV file and extracts position data and possession
    codes as well as teamsheets and pitch information.

    Openly published StatsPerform position data (e.g. for the Pro Forum '22) is stored
    in a CSV file containing all position data (for both halves) as well as information
    about players, the pitch, and ball possession. This function provides high-level
    access to StatsPerform data by parsing the CSV file.

    Parameters
    ----------
    filepath_position: str or pathlib.Path
        Full path to the CSV file.
    teamsheet_home: Teamsheet, optional
        Teamsheet-object for the home team used to create link dictionaries of the
        form `links[team][jID] = xID`. The links are used to map players to a specific
        xID in the respective XY objects. Should be supplied for custom ordering. If
        given as None (default), teamsheet is extracted from the open StatsPerform CSV
        file and its xIDs are assigned in order of appearance.
    teamsheet_away: Teamsheet, optional
        Teamsheet-object for the away team. If given as None (default), teamsheet is
        extracted from the open StatsPerform CSV file. See teamsheet_home for details.

    Returns
    -------
    data_objects: Tuple[Dict[int, Dict[str, XY]], Dict[int, Code], \
            Dict[str, Teamsheet], Pitch]
        Tuple of (nested) floodlight core objects with shape (xy_objects,
        possession_objects, teamsheets, pitch).

        ``xy_objects`` is a nested dictionary containing ``XY`` objects for each team
        and segment of the form ``xy_objects[segment][team] = XY``. For a typical
        league match with two halves and teams, this dictionary looks like:
        ``{0: {'Home': XY, 'Away': XY}, 1: {'Home': XY, 'Away': XY}}``.

        ``possession_objects`` is a dictionary containing ``Code`` objects with
        possession information (home or away) for each segment of the form
        ``possession_objects[segment] = Code``.

        ``teamsheets`` is a dictionary containing ``Teamsheet`` objects for each team
        of the form ``teamsheets[team] = Teamsheet``.

        ``pitch`` is a ``Pitch`` object corresponding to the data.
    """
    # parse the CSV file into pd.DataFrame
    dat_df = pd.read_csv(str(filepath_position))

    # initialize team and ball ids
    team_ids = {"Home": 1.0, "Away": 2.0}
    ball_id = 4

    # check for additional tIDs
    for ID in dat_df["team_id"].unique():
        if not (ID in team_ids.values() or ID == ball_id):
            warnings.warn(f"Team ID {ID} did not match any of the standard IDs!")

    # create or check teamsheet objects
    if teamsheet_home is None and teamsheet_away is None:
        teamsheets = read_teamsheets_from_open_data_csv(filepath_position)
        teamsheet_home = teamsheets["Home"]
        teamsheet_away = teamsheets["Away"]
    elif teamsheet_home is None:
        teamsheets = read_teamsheets_from_open_data_csv(filepath_position)
        teamsheet_home = teamsheets["Home"]
    elif teamsheet_away is None:
        teamsheets = read_teamsheets_from_open_data_csv(filepath_position)
        teamsheet_away = teamsheets["Away"]
    else:
        pass
        # potential check

    # create links
    if "xID" not in teamsheet_home.teamsheet.columns:
        teamsheet_home.add_xIDs()
    if "xID" not in teamsheet_away.teamsheet.columns:
        teamsheet_away.add_xIDs()
    links_jID_to_xID = {}
    links_jID_to_xID["Home"] = teamsheet_home.get_links("jID", "xID")
    links_jID_to_xID["Away"] = teamsheet_away.get_links("jID", "xID")

    # create periods and pitch
    periods, pitch = _create_metadata_from_open_csv_df(dat_df)
    segments = list(periods.keys())

    # infer data shapes
    number_of_players = {
        team: len(links_jID_to_xID[team]) for team in links_jID_to_xID
    }
    number_of_frames = {}
    for segment in segments:
        start = periods[segment][0]
        end = periods[segment][1]
        number_of_frames[segment] = end - start + 1

    # bins
    codes = {"possession": {segment: [] for segment in segments}}
    xydata = {
        "Home": {
            segment: np.full(
                [
                    number_of_frames[segment],
                    number_of_players[list(links_jID_to_xID.keys())[0]] * 2,
                ],
                np.nan,
            )
            for segment in periods
        },
        "Away": {
            segment: np.full(
                [
                    number_of_frames[segment],
                    number_of_players[list(links_jID_to_xID.keys())[1]] * 2,
                ],
                np.nan,
            )
            for segment in periods
        },
        "Ball": {
            segment: np.full([number_of_frames[segment], 2], np.nan)
            for segment in periods
        },
    }

    # loop
    for segment in segments:

        # teams
        for team in team_ids:
            team_df = dat_df[dat_df["team_id"] == team_ids[team]]
            for pID in team_df["player_id"].unique():
                # extract player information
                pl_df = team_df[team_df["player_id"] == pID]
                frames = pl_df["frame_count"].values
                x_position = pl_df["pos_x"].values
                y_position = pl_df["pos_y"].values

                # compute appearance of player in segment
                appearance = np.array(
                    [
                        (periods[segment][0] <= frame <= periods[segment][-1])
                        for frame in frames
                    ]
                )
                # check for players that did not play in segment
                if not np.sum(appearance):
                    continue

                # insert player position to bin array
                jrsy = int(pl_df["jersey_no"].values[0])
                x_col = (links_jID_to_xID[team][jrsy] - 1) * 2
                y_col = (links_jID_to_xID[team][jrsy] - 1) * 2 + 1
                start = frames[appearance][0] - periods[segment][0]
                end = frames[appearance][-1] - periods[segment][0] + 1
                xydata[team][segment][start:end, x_col] = x_position[appearance]
                xydata[team][segment][start:end, y_col] = y_position[appearance]

        # ball
        ball_df = dat_df[dat_df["team_id"] == 4]
        frames = ball_df["frame_count"].values
        appearance = np.array(
            [(periods[segment][0] <= frame <= periods[segment][-1]) for frame in frames]
        )
        xydata["Ball"][segment][:, 0] = ball_df["pos_x"].values[appearance]
        xydata["Ball"][segment][:, 1] = ball_df["pos_y"].values[appearance]

        # update codes
        codes["possession"][segment] = ball_df["possession"].values[appearance]

    # create objects
    xy_objects = {}
    possession_objects = {}
    for segment in segments:
        xy_objects[segment] = {}
        possession_objects[segment] = Code(
            code=codes["possession"][segment],
            name="possession",
            definitions=dict([(team_id, team) for team, team_id in team_ids.items()]),
            framerate=10,
        )
        for team in ["Home", "Away", "Ball"]:
            xy_objects[segment][team] = XY(
                xy=xydata[team][segment],
                framerate=10,
            )
    teamsheets = {
        "Home": teamsheet_home,
        "Away": teamsheet_away,
    }

    # pack objects
    data_objects = (
        xy_objects,
        possession_objects,
        teamsheets,
        pitch,
    )

    return data_objects
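

# Illustrative usage sketch added for documentation purposes; not part of the original
# module. The file path is a placeholder; segments of the open position data are keyed
# by the integers 0 and 1 (see the period detection above).
def _example_open_position_data():
    xy_objects, possession_objects, teamsheets, pitch = read_open_position_data_csv(
        "open_positions.csv"
    )
    # raw (N x 2) ball coordinates of the first segment
    ball_xy = xy_objects[0]["Ball"].xy
    # possession code per frame (1.0 = Home, 2.0 = Away, see ``definitions``)
    possession = possession_objects[0].code
    return ball_xy.shape, possession.shape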


# ----------------------------- StatsPerform Format ---------------------------


def _read_position_data_txt_single_line(
    line: str,
) -> Tuple[
    int,
    int,
    Dict[str, Dict[str, Tuple[float, float, float]]],
    Dict[str, Union[str, tuple]],
]:
    """Extracts all relevant information from a single line of StatsPerform's position
    data TXT file (i.e. one frame of data).

    Parameters
    ----------
    line: str
        One full line from StatsPerform's .txt-file, equals one sample of data.

    Returns
    -------
    gameclock: int
        The gameclock of the current segment in milliseconds.
    segment: int
        The segment identifier.
    positions: Dict[str, Dict[str, Tuple[float, float, float]]]
        Nested dictionary that stores player position information for each team and
        player. Has the form ``positions[team][jID] = (x, y)``.
    ball: Dict[str]
        Dictionary with ball information. Has keys 'position', 'possession' and
        'ballstatus'.
    """
    # bins
    positions = {"Home": {}, "Away": {}, "Other": {}}
    ball = {}

    # read chunks
    chunks = line.split(":")
    time_chunk = chunks[0]
    player_chunks = chunks[1].split(";")

    ball_chunk = None
    if len(chunks) > 2:  # check if ball information exist in chunk
        ball_chunk = chunks[2]

    # time chunk
    # systemclock = time_chunk.split(";")[0]
    # possible check or synchronization step
    timeinfo = time_chunk.split(";")[1].split(",")
    gameclock = int(timeinfo[0])
    segment = int(timeinfo[1])
    # ballstatus = timeinfo[2].split(":")[0] == '0'  # '0' seems to be always the case?

    # player chunks
    for player_chunk in player_chunks:

        # skip final entry of chunk
        if not player_chunk or player_chunk == "\n":
            continue

        # read team
        chunk_data = player_chunk.split(",")
        if chunk_data[0] in ["0", "3"]:
            team = "Home"
        elif chunk_data[0] in ["1", "4"]:
            team = "Away"
        else:
            team = "Other"

        # read IDs
        # pID = chunk_data[1]
        jID = chunk_data[2]

        # read positions
        x, y = map(lambda x: float(x), chunk_data[3:])

        # assign
        positions[team][jID] = (x, y)

    # ball chunk
    if ball_chunk is not None:
        x, y, z = map(lambda x: float(x), ball_chunk.split(";")[0].split(","))
        # ball["position"] = (x, y, z)  # z-coordinate is not yet supported
        ball["position"] = (x, y)

    return gameclock, segment, positions, ball


def _read_time_information_from_position_data_txt(
    filepath_position: Union[str, Path],
) -> Tuple[Dict, Union[int, None]]:
    """Reads StatsPerform's position TXT file and extracts information about the first
    and last frame of periods. Also, a framerate is estimated from the gameclock
    difference between samples.

    Parameters
    ----------
    filepath_position: str or pathlib.Path
        Full path to the TXT file containing the position data.

    Returns
    -------
    periods: Dict
        Dictionary with start and endframes:
        ``periods[segment] = [startframe, endframe]``.
    framerate_est: int or None
        Estimated temporal resolution of data in frames per second/Hertz.
    """
    # bins
    startframes = {}
    endframes = {}
    framerate_est = None

    # read TXT file from disk
    file_txt = open(filepath_position, "r")

    # loop
    last_gameclock = None
    last_segment = None
    for line in file_txt.readlines():

        # read gameclock and segment
        gameclock, segment, _, _ = _read_position_data_txt_single_line(line)

        # update periods
        if segment not in startframes:
            startframes[segment] = gameclock
            if last_gameclock is not None:
                endframes[last_segment] = last_gameclock

        # estimate framerate if desired
        if last_gameclock is not None:
            delta = np.absolute(gameclock - last_gameclock)  # in milliseconds
            if framerate_est is None:
                framerate_est = int(1000 / delta)
            elif framerate_est != int(1000 / delta) and last_segment == segment:
                warnings.warn(
                    f"Framerate estimation yielded diverging results. "
                    f"The originally estimated framerate of {framerate_est} Hz did not "
                    f"match the current estimation of {int(1000 / delta)} Hz. This "
                    f"might be caused by missing frame(s) in the position data. "
                    f"Continuing by choosing the latest estimation of "
                    f"{int(1000 / delta)} Hz"
                )
                framerate_est = int(1000 / delta)

        # update variables
        last_gameclock = gameclock
        last_segment = segment

    # update end of final segment
    endframes[last_segment] = last_gameclock

    # assembly
    periods = {
        segment: (startframes[segment], endframes[segment]) for segment in startframes
    }

    # close file
    file_txt.close()

    return periods, framerate_est


def _read_jersey_numbers_from_position_data_txt(
    file_location_txt: Union[str, Path],
) -> Tuple[set, set]:
    """Reads StatsPerform's position TXT file and extracts unique set of jIDs
    (jerseynumbers) for both teams.

    Parameters
    ----------
    file_location_txt: str or pathlib.Path
        Full path to the TXT file containing the position data.

    Returns
    -------
    home_jIDs: set
    away_jIDs: set
    """
    # bins
    home_jIDs = set()
    away_jIDs = set()

    # read TXT file from disk
    file_txt = open(file_location_txt, "r")

    # loop
    for package in file_txt.readlines():

        # read line
        _, _, positions, _ = _read_position_data_txt_single_line(package)

        # extract jersey numbers
        home_jIDs |= set(positions["Home"].keys())
        away_jIDs |= set(positions["Away"].keys())

    # close file
    file_txt.close()

    return home_jIDs, away_jIDs


def read_teamsheets_from_event_data_xml(
    filepath_events: Union[str, Path],
) -> Dict[str, Teamsheet]:
    """Parses the StatsPerform event file and returns two Teamsheet-objects with
    detailed player information for the home and the away team.

    Parameters
    ----------
    filepath_events: str or pathlib.Path
        Full path to the XML file containing the event data.

    Returns
    -------
    teamsheets: Dict[str, Teamsheet]
        Dictionary with teamsheets for the home team and the away team.
    """
    # load event data xml tree into memory
    tree = etree.parse(str(filepath_events))
    root = tree.getroot()

    # initialize teamsheets
    teamsheets = {
        "Home": pd.DataFrame(
            columns=["player", "position", "team_name", "jID", "pID", "tID", "started"]
        ),
        "Away": pd.DataFrame(
            columns=["player", "position", "team_name", "jID", "pID", "tID", "started"]
        ),
    }

    # parse player information
    for team_matchsheet in root.findall("MatchSheet/Team"):

        # skip referees
        if team_matchsheet.attrib["Type"] == "Referees":
            continue

        # read team
        team = team_matchsheet.attrib["Type"][:-4]  # cut 'Team' off e.g. 'HomeTeam'
        tID = team_matchsheet.attrib["IdTeam"]
        team_name = team_matchsheet.attrib["Name"]

        # find players
        players = [
            actor
            for actor in team_matchsheet.findall("Actor")
            if actor.attrib["Occupation"] == "Player"
        ]

        # create teamsheet
        teamsheets[team]["player"] = [
            get_and_convert(player, "NickName", str) for player in players
        ]
        teamsheets[team]["pID"] = [
            get_and_convert(player, "IdActor", int) for player in players
        ]
        teamsheets[team]["jID"] = [
            get_and_convert(player, "JerseyNumber", int) for player in players
        ]
        teamsheets[team]["position"] = [
            get_and_convert(player, "Position", str) for player in players
        ]
        teamsheets[team]["started"] = [
            player.get("IsStarter") == "True" for player in players
        ]
        teamsheets[team]["tID"] = tID
        teamsheets[team]["team_name"] = team_name

    # create teamsheet objects
    for team in teamsheets:
        teamsheets[team] = Teamsheet(teamsheets[team])

    return teamsheets


def read_teamsheets_from_position_data_txt(
    filepath_position: Union[str, Path],
) -> Dict[str, Teamsheet]:
    """Parses the StatsPerform position file and returns two simple Teamsheet-objects
    containing only two columns "player" and "jID" for the home and the away team.

    Parameters
    ----------
    filepath_position: str or pathlib.Path
        Full path to the TXT file containing the position data.

    Returns
    -------
    teamsheets: Dict[str, Teamsheet]
        Dictionary with teamsheets for the home team and the away team.
    """
    # create list of jIDs
    homejrsy, awayjrsy = _read_jersey_numbers_from_position_data_txt(filepath_position)
    homejrsy = list(homejrsy)
    awayjrsy = list(awayjrsy)
    homejrsy.sort()
    awayjrsy.sort()
    jIDs = {
        "Home": homejrsy,
        "Away": awayjrsy,
    }

    # create teamsheets
    teamsheets = {
        "Home": pd.DataFrame(columns=["player", "jID"]),
        "Away": pd.DataFrame(columns=["player", "jID"]),
    }
    for team in teamsheets:
        teamsheets[team]["player"] = [f"Player {i}" for i in range(len(jIDs[team]))]
        teamsheets[team]["jID"] = [int(jID) for jID in jIDs[team]]

    # create teamsheet objects
    for team in teamsheets:
        teamsheets[team] = Teamsheet(teamsheets[team])

    return teamsheets


def read_event_data_xml(
    filepath_events: Union[str, Path],
    teamsheet_home: Teamsheet = None,
    teamsheet_away: Teamsheet = None,
) -> Tuple[Dict[str, Dict[str, Events]], Dict[str, Teamsheet], Pitch]:
    """Parses a StatsPerform XML file and extracts event data and pitch information.

    This function provides high-level access to the StatsPerform match events XML file
    and returns Events objects for both teams and information about the pitch.

    Parameters
    ----------
    filepath_events: str or pathlib.Path
        Full path to the XML file containing the event data.
    teamsheet_home: Teamsheet, optional
        Teamsheet-object for the home team used to create link dictionaries of the
        form `links[pID] = team`. The links are used to map players to the home and
        away teams. If given as None (default), teamsheet is extracted from the event
        data XML file.
    teamsheet_away: Teamsheet, optional
        Teamsheet-object for the away team. If given as None (default), teamsheet is
        extracted from the event data XML file. See teamsheet_home for details.

    Returns
    -------
    data_objects: Tuple[Dict[str, Dict[str, Events]], Dict[str, Teamsheet], Pitch]
        Tuple of (nested) floodlight core objects with shape (events_objects,
        teamsheets, pitch).

        ``events_objects`` is a nested dictionary containing ``Events`` objects for
        each team and segment of the form ``events_objects[segment][team] = Events``.
        For a typical league match with two halves and teams, this dictionary looks
        like: ``{'HT1': {'Home': Events, 'Away': Events}, 'HT2': {'Home': Events,
        'Away': Events}}``.

        ``teamsheets`` is a dictionary containing ``Teamsheet`` objects for each team
        of the form ``teamsheets[team] = Teamsheet``.

        ``pitch`` is a ``Pitch`` object corresponding to the data.
    """
    # load xml tree into memory
    tree = etree.parse(str(filepath_events))
    root = tree.getroot()

    # create bins, read segments, and assign teams
    columns = [
        "eID",
        "gameclock",
        "pID",
        "minute",
        "second",
        "at_x",
        "at_y",
        "to_x",
        "to_y",
        "qualifier",
    ]
    segments = [
        f"HT{get_and_convert(period.attrib, 'IdHalf', str)}"
        for period in root.findall("Events/EventsHalf")
    ]
    teams = ["Home", "Away"]

    # create or check teamsheet objects
    if teamsheet_home is None and teamsheet_away is None:
        teamsheets = read_teamsheets_from_event_data_xml(filepath_events)
        teamsheet_home = teamsheets["Home"]
        teamsheet_away = teamsheets["Away"]
    elif teamsheet_home is None:
        teamsheets = read_teamsheets_from_event_data_xml(filepath_events)
        teamsheet_home = teamsheets["Home"]
    elif teamsheet_away is None:
        teamsheets = read_teamsheets_from_event_data_xml(filepath_events)
        teamsheet_away = teamsheets["Away"]
    else:
        pass
        # potential check

    # create links between pIDs and team
    links_pID_to_team = {}
    links_pID_to_team.update({pID: "Home" for pID in teamsheet_home["pID"]})
    links_pID_to_team.update({pID: "Away" for pID in teamsheet_away["pID"]})

    # bins
    event_lists = {
        team: {segment: {col: [] for col in columns} for segment in segments}
        for team in teams
    }

    # loop over events
    for half in root.findall("Events/EventsHalf"):
        # get segment information
        period = get_and_convert(half.attrib, "IdHalf", str)
        segment = "HT" + str(period)
        for event in half.findall("Event"):
            # read pID
            pID = get_and_convert(event.attrib, "IdActor1", int)

            # assign team
            team = get_and_convert(links_pID_to_team, pID, str)

            # create list of either a single team or both teams if no clear assignment
            if team == "None":
                teams_assigned = teams  # add to both teams
            else:
                teams_assigned = [team]  # only add to one team

            # identifier
            eID = get_and_convert(event.attrib, "EventName", str)
            for team in teams_assigned:
                event_lists[team][segment]["eID"].append(eID)
                event_lists[team][segment]["pID"].append(pID)

            # relative time
            gameclock = get_and_convert(event.attrib, "Time", int) / 1000
            minute = np.floor(gameclock / 60)
            second = np.floor(gameclock - minute * 60)
            for team in teams_assigned:
                event_lists[team][segment]["gameclock"].append(gameclock)
                event_lists[team][segment]["minute"].append(minute)
                event_lists[team][segment]["second"].append(second)

            # location
            at_x = get_and_convert(event.attrib, "LocationX", float)
            at_y = get_and_convert(event.attrib, "LocationY", float)
            to_x = get_and_convert(event.attrib, "TargetX", float)
            to_y = get_and_convert(event.attrib, "TargetY", float)
            for team in teams_assigned:
                event_lists[team][segment]["at_x"].append(at_x)
                event_lists[team][segment]["at_y"].append(at_y)
                event_lists[team][segment]["to_x"].append(to_x)
                event_lists[team][segment]["to_y"].append(to_y)

            # qualifier
            qual_dict = {}
            for qual_id in event.attrib:
                qual_value = event.attrib.get(qual_id)
                qual_dict[qual_id] = qual_value
            for team in teams_assigned:
                event_lists[team][segment]["qualifier"].append(str(qual_dict))

    # create pitch
    length = get_and_convert(root.attrib, "FieldLength", int) / 100
    width = get_and_convert(root.attrib, "FieldWidth", int) / 100
    pitch = Pitch.from_template(
        "statsperform_event",
        length=length,
        width=width,
        sport="football",
    )

    # create objects
    events_objects = {}
    for segment in segments:
        events_objects[segment] = {}
        for team in ["Home", "Away"]:
            events_objects[segment][team] = Events(
                events=pd.DataFrame(data=event_lists[team][segment]),
            )
    teamsheets = {
        "Home": teamsheet_home,
        "Away": teamsheet_away,
    }

    # pack objects
    data_objects = (events_objects, teamsheets, pitch)

    return data_objects
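

# Illustrative usage sketch added for documentation purposes; not part of the original
# module. The file path is a placeholder. Since the qualifier column stores
# ``str(qual_dict)``, one way to recover the dict is ast.literal_eval; this is an
# assumption about downstream handling, not something prescribed by the parser.
def _example_event_data_xml():
    import ast

    events_objects, teamsheets, pitch = read_event_data_xml("match_events.xml")
    home_ht1 = events_objects["HT1"]["Home"].events
    # recover the raw XML attributes of the first event as a dict
    first_qualifier = ast.literal_eval(home_ht1["qualifier"].iloc[0])
    return first_qualifier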


def read_position_data_txt(
    filepath_position: Union[str, Path],
    teamsheet_home: Teamsheet = None,
    teamsheet_away: Teamsheet = None,
) -> Tuple[Dict[int, Dict[str, XY]], Dict[int, Teamsheet]]:
    """Parses a StatsPerform TXT file and extracts position data and teamsheets.

    Internal StatsPerform position data is stored as a TXT file containing all position
    data (for both halves). This function provides high-level access to StatsPerform
    data by parsing the TXT file. Since no information about the framerate is delivered
    in the data itself, it is estimated from the time difference between individual
    frames. Teamsheets can be supplied via the ``teamsheet_home`` and
    ``teamsheet_away`` arguments; otherwise, minimal Teamsheet-objects are inferred
    from the position data.

    Parameters
    ----------
    filepath_position: str or pathlib.Path
        Full path to the TXT file containing the position data.
    teamsheet_home: Teamsheet, optional
        Teamsheet-object for the home team used to create link dictionaries of the
        form `links[team][jID] = xID`. The links are used to map players to a specific
        xID in the respective XY objects. Should be supplied for custom ordering. If
        given as None (default), teamsheet is extracted from the position data TXT
        file and its xIDs are assigned in order of appearance.
    teamsheet_away: Teamsheet, optional
        Teamsheet-object for the away team. If given as None (default), teamsheet is
        extracted from the position data TXT file. See teamsheet_home for details.

    Returns
    -------
    data_objects: Tuple[Dict[int, Dict[str, XY]], Dict[int, Teamsheet]]
        Tuple of (nested) floodlight core objects with shape (xy_objects, teamsheets).

        ``xy_objects`` is a nested dictionary containing ``XY`` objects for each team
        and segment of the form ``xy_objects[segment][team] = XY``. For a typical
        league match with two halves and teams, this dictionary looks like:
        ``{1: {'Home': XY, 'Away': XY}, 2: {'Home': XY, 'Away': XY}}``.

        ``teamsheets`` is a dictionary containing ``Teamsheet`` objects for each team
        of the form ``teamsheets[team] = Teamsheet``.

    Notes
    -----
    StatsPerform position data does not contain any player information except jersey
    numbers by default. Thus, the teamsheet objects generated by this method will name
    players 'Player i' with i starting at 1. To identify players, use the jersey
    numbers of players or provide custom teamsheets (e.g. by parsing teamsheets from
    the StatsPerform event data or another data provider).
    """
    # parse TXT file for periods and estimate framerate if not contained in filepath
    periods, framerate_est = _read_time_information_from_position_data_txt(
        filepath_position
    )
    segments = list(periods.keys())

    # create or check teamsheet objects
    if teamsheet_home is None and teamsheet_away is None:
        teamsheets = read_teamsheets_from_position_data_txt(filepath_position)
        teamsheet_home = teamsheets["Home"]
        teamsheet_away = teamsheets["Away"]
    elif teamsheet_home is None:
        teamsheets = read_teamsheets_from_position_data_txt(filepath_position)
        teamsheet_home = teamsheets["Home"]
    elif teamsheet_away is None:
        teamsheets = read_teamsheets_from_position_data_txt(filepath_position)
        teamsheet_away = teamsheets["Away"]
    else:
        pass
        # potential check

    # create links
    if "xID" not in teamsheet_home.teamsheet.columns:
        teamsheet_home.add_xIDs()
    if "xID" not in teamsheet_away.teamsheet.columns:
        teamsheet_away.add_xIDs()
    links_jID_to_xID = {}
    links_jID_to_xID["Home"] = teamsheet_home.get_links("jID", "xID")
    links_jID_to_xID["Away"] = teamsheet_away.get_links("jID", "xID")

    # infer data array shapes
    number_of_home_players = max(links_jID_to_xID["Home"].values()) + 1
    number_of_away_players = max(links_jID_to_xID["Away"].values()) + 1
    number_of_frames = {}
    for segment in segments:
        number_of_frames[segment] = (
            int((periods[segment][1] - periods[segment][0]) / 1000 * framerate_est) + 1
        )

    # bins
    xydata = {}
    xydata["Home"] = {
        segment: np.full(
            [number_of_frames[segment], number_of_home_players * 2], np.nan
        )
        for segment in segments
    }
    xydata["Away"] = {
        segment: np.full(
            [number_of_frames[segment], number_of_away_players * 2], np.nan
        )
        for segment in segments
    }
    xydata["Ball"] = {
        segment: np.full([number_of_frames[segment], 2], np.nan)
        for segment in segments
    }

    # read TXT file from disk
    with open(filepath_position, "r") as f:
        tracking_data_lines = f.readlines()

    # loop
    for package in tracking_data_lines:

        # read line to get gameclock, player positions and ball info
        (
            gameclock,
            segment,
            positions,
            ball,
        ) = _read_position_data_txt_single_line(package)

        # check if frame is in any segment
        if segment is None:
            # skip line if not
            continue
        else:
            # otherwise calculate relative frame (in respective segment)
            frame_rel = int((gameclock - periods[segment][0]) / 1000 * framerate_est)

        # insert (x,y)-data into np.array
        for team in ["Home", "Away"]:
            for jID in positions[team].keys():
                # map jersey number to array index and infer respective columns
                x_col = (links_jID_to_xID[team][int(jID)] - 1) * 2
                y_col = (links_jID_to_xID[team][int(jID)] - 1) * 2 + 1
                xydata[team][segment][frame_rel, x_col] = positions[team][jID][0]
                xydata[team][segment][frame_rel, y_col] = positions[team][jID][1]

        # get ball data
        xydata["Ball"][segment][frame_rel] = ball.get("position", np.nan)

    # create objects
    xy_objects = {}
    for segment in segments:
        xy_objects[segment] = {}
        for team in ["Home", "Away", "Ball"]:
            xy_objects[segment][team] = XY(
                xy=xydata[team][segment],
                framerate=framerate_est,
            )
    teamsheets = {
        "Home": teamsheet_home,
        "Away": teamsheet_away,
    }

    # pack objects
    data_objects = (
        xy_objects,
        teamsheets,
    )

    return data_objects


def read_event_data_from_url(
    url: str,
    teamsheet_home: Teamsheet = None,
    teamsheet_away: Teamsheet = None,
) -> Tuple[Dict[str, Dict[str, Events]], Dict[str, Teamsheet], Pitch]:
    """Reads a URL containing a StatsPerform events XML file and extracts the stored
    event data, pitch information, and teamsheets.

    The event data from the URL is downloaded into a temporary file stored in the
    repository's internal root ``.data``-folder and removed afterwards.

    Parameters
    ----------
    url: str
        URL to the XML file containing the event data.
    teamsheet_home: Teamsheet, optional
        Teamsheet-object for the home team used to create link dictionaries of the
        form `links[pID] = team`. The links are used to map players to the home and
        away teams. If given as None (default), teamsheet is extracted from the event
        data XML file.
    teamsheet_away: Teamsheet, optional
        Teamsheet-object for the away team. If given as None (default), teamsheet is
        extracted from the event data XML file. See teamsheet_home for details.

    Returns
    -------
    data_objects: Tuple[Dict[str, Dict[str, Events]], Dict[str, Teamsheet], Pitch]
        Tuple of (nested) floodlight core objects with shape (events_objects,
        teamsheets, pitch).

        ``events_objects`` is a nested dictionary containing ``Events`` objects for
        each team and segment of the form ``events_objects[segment][team] = Events``.
        For a typical league match with two halves and teams, this dictionary looks
        like: ``{'HT1': {'Home': Events, 'Away': Events}, 'HT2': {'Home': Events,
        'Away': Events}}``.

        ``teamsheets`` is a dictionary containing ``Teamsheet`` objects for each team
        of the form ``teamsheets[team] = Teamsheet``.

        ``pitch`` is a ``Pitch`` object corresponding to the data.
    """
    data_dir = os.path.join(DATA_DIR, "statsperform")
    if not os.path.isdir(data_dir):
        os.makedirs(data_dir, exist_ok=True)
    temp_file = os.path.join(data_dir, "events_temp.xml")
    with open(temp_file, "wb") as binary_file:
        binary_file.write(download_from_url(url))
    events_objects, teamsheets, pitch = read_event_data_xml(
        filepath_events=os.path.join(data_dir, temp_file),
        teamsheet_home=teamsheet_home,
        teamsheet_away=teamsheet_away,
    )
    data_objects = (events_objects, teamsheets, pitch)
    os.remove(os.path.join(data_dir, temp_file))

    return data_objects


def read_position_data_from_url(
    url: str,
    teamsheet_home: Teamsheet = None,
    teamsheet_away: Teamsheet = None,
) -> Tuple[Dict[int, Dict[str, XY]], Dict[int, Teamsheet]]:
    """Reads a URL from the StatsPerform API (StatsEdgeViewer) containing a position
    data TXT file and extracts position data and teamsheets.

    The position data from the URL is downloaded into a temporary file stored in the
    repository's internal root ``.data``-folder and removed afterwards.

    Parameters
    ----------
    url: str or pathlib.Path
        URL to the TXT file containing the position data.
    teamsheet_home: Teamsheet, optional
        Teamsheet-object for the home team used to create link dictionaries of the
        form `links[team][jID] = xID`. The links are used to map players to a specific
        xID in the respective XY objects. Should be supplied for custom ordering. If
        given as None (default), teamsheet is extracted from the position data TXT
        file and its xIDs are assigned in order of appearance.
    teamsheet_away: Teamsheet, optional
        Teamsheet-object for the away team. If given as None (default), teamsheet is
        extracted from the position data TXT file. See teamsheet_home for details.

    Returns
    -------
    data_objects: Tuple[Dict[int, Dict[str, XY]], Dict[int, Teamsheet]]
        Tuple of (nested) floodlight core objects with shape (xy_objects, teamsheets).

        ``xy_objects`` is a nested dictionary containing ``XY`` objects for each team
        and segment of the form ``xy_objects[segment][team] = XY``. For a typical
        league match with two halves and teams, this dictionary looks like:
        ``{1: {'Home': XY, 'Away': XY}, 2: {'Home': XY, 'Away': XY}}``.

        ``teamsheets`` is a dictionary containing ``Teamsheet`` objects for each team
        of the form ``teamsheets[team] = Teamsheet``.

    Notes
    -----
    StatsPerform position data does not contain any player information except jersey
    numbers by default. Thus, the teamsheet objects generated by this method will name
    players 'Player i' with i starting at 1. To identify players, use the jersey
    numbers of players or provide custom teamsheets (e.g. by parsing teamsheets from
    the StatsPerform event data or another data provider).
    """
    data_dir = os.path.join(DATA_DIR, "statsperform")
    if not os.path.isdir(data_dir):
        os.makedirs(data_dir, exist_ok=True)
    temp_file = os.path.join(data_dir, "tracking_temp.txt")
    with open(temp_file, "wb") as binary_file:
        binary_file.write(download_from_url(url))
    xy_objects, teamsheets = read_position_data_txt(
        filepath_position=os.path.join(data_dir, temp_file),
        teamsheet_home=teamsheet_home,
        teamsheet_away=teamsheet_away,
    )
    data_objects = (xy_objects, teamsheets)
    os.remove(os.path.join(data_dir, temp_file))

    return data_objects
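

# Illustrative usage sketch added for documentation purposes; not part of the original
# module. The URL is a placeholder for a StatsPerform API (StatsEdgeViewer) link that
# serves the position data TXT file.
def _example_position_data_from_url():
    url = "https://example.com/path/to/tracking.txt"  # placeholder
    xy_objects, teamsheets = read_position_data_from_url(url)
    # e.g. inspect the framerate estimated from the downloaded data
    first_segment = list(xy_objects.keys())[0]
    return xy_objects[first_segment]["Home"].framerate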