Source code for floodlight.io.statsperform

import os.path
import warnings
from typing import Dict, Tuple, Union
from pathlib import Path

import numpy as np
import pandas as pd
from lxml import etree

from floodlight.io.utils import download_from_url, get_and_convert
from floodlight.core.code import Code
from floodlight.core.events import Events
from floodlight.core.pitch import Pitch
from floodlight.core.teamsheet import Teamsheet
from floodlight.core.xy import XY
from floodlight.settings import DATA_DIR


# ----------------------------- StatsPerform Open Format -------------------------------


def _create_metadata_from_open_csv_df(
    csv_df: pd.DataFrame,
) -> Tuple[Dict[int, tuple], Pitch]:
    """Creates meta information from a pd.DataFrame that results from parsing the open
    StatsPerform CSV file.

    Parameters
    ----------
    csv_df: pd.DataFrame
        Data Frame with the parsed event data CSV file.

    Returns
    -------
    periods: Dict[int, tuple]
        Dictionary with start and endframes:
            ``periods[segment] = (startframe, endframe)``.
    pitch: Pitch
        Playing Pitch object.
    """

    # create pitch
    pi_len = csv_df["pitch_dimension_long_side"].values[0]
    pi_wid = csv_df["pitch_dimension_short_side"].values[0]
    pitch = Pitch.from_template(
        "statsperform_open",
        length=pi_len,
        width=pi_wid,
        sport="football",
    )

    # create periods for segments, coded as jumps in the frame sequence
    periods = {}
    frame_values = csv_df["frame_count"].unique()

    seg_idx = np.where(np.diff(frame_values, prepend=frame_values[0]) > 1)
    seg_idx = np.insert(seg_idx, 0, 0)
    seg_idx = np.append(seg_idx, len(frame_values))
    for segment in range(len(seg_idx) - 1):
        start = int(frame_values[seg_idx[segment]])
        end = int(frame_values[seg_idx[segment + 1] - 1])
        periods[segment] = (start, end)

    return periods, pitch
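

# Illustrative sketch added for documentation purposes; not part of the original
# module. The period detection above keys off jumps in the frame counter: np.diff with
# ``prepend`` yields 0 at the first frame and a value > 1 wherever the counter jumps
# into a new segment. The toy frame values below are made up for demonstration.
def _demo_segment_detection_from_frame_jumps():
    frame_values = np.array([10, 11, 12, 13, 500, 501, 502])  # jump between 13 and 500
    seg_idx = np.where(np.diff(frame_values, prepend=frame_values[0]) > 1)
    seg_idx = np.insert(seg_idx, 0, 0)
    seg_idx = np.append(seg_idx, len(frame_values))
    periods = {
        seg: (int(frame_values[seg_idx[seg]]), int(frame_values[seg_idx[seg + 1] - 1]))
        for seg in range(len(seg_idx) - 1)
    }
    return periods  # {0: (10, 13), 1: (500, 502)}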


def _read_open_event_csv_single_line(
    line: str,
) -> Tuple[Dict, str, str]:
    """Extracts all relevant information from a single line of StatsPerform's Event csv
    file (i.e. one single event in the data).

    Parameters
    ----------
    line: str
        One full line from StatsPerform's Event CSV file.

    Returns
    -------
    event: Dict
        Dictionary with relevant event information in the form:
        ``event[attribute] = value``.
    team: str
        The team ID parsed from the event line.
    segment: str
        The segment identifier parsed from the event line.
    """
    event = {}
    attrib = line.split(sep=",")

    # description
    event["eID"] = attrib[5].replace(" ", "")

    # relative time
    event["gameclock"] = float(attrib[4])
    event["frameclock"] = float(attrib[2])

    # segment, player and team
    segment = attrib[3]
    team = attrib[9]
    event["tID"] = team
    event["pID"] = attrib[8]

    # outcome
    event["outcome"] = np.nan
    if "Won" in attrib[5].split(" "):
        event["outcome"] = 1
    elif "Lost" in attrib[5].split(" "):
        event["outcome"] = 0

    # minute and second of game
    event["minute"] = np.floor(event["gameclock"] / 60)
    event["second"] = np.floor(event["gameclock"] - event["minute"] * 60)

    # additional information (qualifier)
    event["qualifier"] = {
        "event_id": attrib[1],
        "event_type_id": attrib[6],
        "sequencenumber": attrib[7],
        "jersey_no": attrib[10],
        "is_pass": attrib[11],
        "is_cross": attrib[12],
        "is_corner": attrib[13],
        "is_free_kick": attrib[14],
        "is_goal_kick": attrib[15],
        "passtypeid": attrib[16],
        "wintypeid": attrib[17],
        "savetypeid": attrib[18],
        "possessionnumber": attrib[19],
    }

    return event, team, segment


def read_teamsheets_from_open_data_csv(
    filepath_csv: Union[str, Path]
) -> Dict[str, Teamsheet]:
    """Parses the entire open StatsPerform position data CSV file for unique jIDs
    (jerseynumbers) and creates teamsheets for both teams.

    Parameters
    ----------
    filepath_csv: str or pathlib.Path
        CSV file containing either open position or open event data.

    Returns
    -------
    teamsheets: Dict[str, Teamsheet]
        Dictionary with teamsheets for the home team and the away team.

    Notes
    -----
    StatsPerform open data does not contain any player names. Thus, the teamsheet
    objects generated by this method will name players 'Player i' with i starting at 1.
    To identify players, use the jersey numbers of players or provide custom teamsheets
    generated by a different parser if StatsPerform open data is used in combination
    with other data providers.
    """
    # read CSV file into pd.DataFrame
    csv_df = pd.read_csv(str(filepath_csv))

    # initialize team and ball ids
    team_ids = {"Home": 1.0, "Away": 2.0}
    ball_id = 4

    # check for additional tIDs
    for tID in csv_df["team_id"].unique():
        if not (tID in team_ids.values() or tID == ball_id or np.isnan(tID)):
            warnings.warn(
                f"tID {tID} did not match any of the standard tIDs "
                f"({team_ids.values()}) or the ball ID ({ball_id})!"
            )

    # initialize teamsheets
    teamsheets = {
        "Home": pd.DataFrame(columns=["player", "jID", "pID", "tID"]),
        "Away": pd.DataFrame(columns=["player", "jID", "pID", "tID"]),
    }

    # loop over teams
    for team in team_ids:
        # extract list with pID and jID information for all players in the team
        team_id = team_ids[team]
        team_df = csv_df[csv_df["team_id"] == team_id]
        jIDs = team_df["jersey_no"].unique()
        pIDs = [
            team_df[team_df["jersey_no"] == jID]["player_id"].unique() for jID in jIDs
        ]
        # possible check for multiple pIDs assigned to a single jID

        # insert data to teamsheet
        teamsheets[team]["player"] = [f"Player {i}" for i in range(len(pIDs))]
        teamsheets[team]["jID"] = [jID for jID in jIDs]
        teamsheets[team]["pID"] = [pID[0] for pID in pIDs]
        teamsheets[team]["tID"] = team_id

    # create teamsheet objects
    for team in teamsheets:
        teamsheets[team] = Teamsheet(teamsheets[team])

    return teamsheets
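

# Illustrative usage sketch added for documentation purposes; not part of the original
# module. The file path is a placeholder for any open StatsPerform data CSV file.
def _example_open_teamsheets():
    # "open_match_data.csv" is a placeholder path
    teamsheets = read_teamsheets_from_open_data_csv("open_match_data.csv")
    home = teamsheets["Home"]
    # each Teamsheet wraps a DataFrame with the columns player, jID, pID and tID
    print(home.teamsheet.head())
    # assign xIDs in order of appearance and build the jID -> xID mapping that the
    # position data parser below relies on
    home.add_xIDs()
    links = home.get_links("jID", "xID")
    return links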


def read_open_event_data_csv(
    filepath_events: Union[str, Path],
    teamsheet_home: Teamsheet = None,
    teamsheet_away: Teamsheet = None,
) -> Tuple[Dict[str, Dict[str, Events]], Dict[str, Teamsheet]]:
    """Parses an open StatsPerform Match Event CSV file and extracts the event data
    and teamsheets.

    This function provides high-level access to the particular openly published
    StatsPerform match events CSV file (e.g. for the Pro Forum '22) and returns Events
    objects for both teams.

    Parameters
    ----------
    filepath_events: str or pathlib.Path
        Full path to the CSV file where the StatsPerform event data is saved.
    teamsheet_home: Teamsheet, optional
        Teamsheet-object for the home team. If given as None (default), teamsheet is
        extracted from the event data CSV file.
    teamsheet_away: Teamsheet, optional
        Teamsheet-object for the away team. If given as None (default), teamsheet is
        extracted from the event data CSV file.

    Returns
    -------
    data_objects: Tuple[Dict[str, Dict[str, Events]], Dict[str, Teamsheet]]
        Tuple of (nested) floodlight core objects with shape (events_objects,
        teamsheets).

        ``events_objects`` is a nested dictionary containing ``Events`` objects for
        each team and segment of the form ``events_objects[segment][team] = Events``.
        For a typical league match with two halves and teams, this dictionary looks
        like: ``{'1': {'Home': Events, 'Away': Events}, '2': {'Home': Events, 'Away':
        Events}}``.

        ``teamsheets`` is a dictionary containing ``Teamsheet`` objects for each team
        of the form ``teamsheets[team] = Teamsheet``.

    Notes
    -----
    StatsPerform's open event data format provides certain additional event attributes
    that attach further information to certain events. As of now, this information is
    parsed as a string in the ``qualifier`` column of the returned DataFrame and can be
    transformed to a dict of the form ``{attribute: value}``.
    """
    # initialize bin and variables
    events = {}
    team_ids = {"Home": 1.0, "Away": 2.0}
    segments = ["1", "2"]
    for team in team_ids.values():
        events[team] = {segment: pd.DataFrame() for segment in segments}

    # create or check teamsheet objects
    if teamsheet_home is None and teamsheet_away is None:
        teamsheets = read_teamsheets_from_open_data_csv(filepath_events)
        teamsheet_home = teamsheets["Home"]
        teamsheet_away = teamsheets["Away"]
    elif teamsheet_home is None:
        teamsheets = read_teamsheets_from_open_data_csv(filepath_events)
        teamsheet_home = teamsheets["Home"]
    elif teamsheet_away is None:
        teamsheets = read_teamsheets_from_open_data_csv(filepath_events)
        teamsheet_away = teamsheets["Away"]
    else:
        pass
        # potential check

    # parse event data
    with open(str(filepath_events), "r") as f:
        while True:
            line = f.readline()

            # terminate if at end of file
            if len(line) == 0:
                break

            # skip the head
            if line.split(sep=",")[3] == "current_phase":
                continue

            # read single line
            event, team, segment = _read_open_event_csv_single_line(line)

            # insert to bin
            if team:
                team = float(team)
                events[team][segment] = events[team][segment].append(
                    event, ignore_index=True
                )
            else:  # if no clear assignment possible, insert to bins for both teams
                for team in team_ids.values():
                    events[team][segment] = events[team][segment].append(
                        event, ignore_index=True
                    )

    # create objects
    events_objects = {}
    for segment in segments:
        events_objects[segment] = {}
        for team in ["Home", "Away"]:
            events_objects[segment][team] = Events(
                events=pd.DataFrame(data=events[team_ids[team]][segment]),
            )
    teamsheets = {
        "Home": teamsheet_home,
        "Away": teamsheet_away,
    }

    # pack objects
    data_objects = (events_objects, teamsheets)

    return data_objects


def read_open_position_data_csv(
    filepath_position: Union[str, Path],
    teamsheet_home: Teamsheet = None,
    teamsheet_away: Teamsheet = None,
) -> Tuple[Dict[int, Dict[str, XY]], Dict[int, Code], Dict[str, Teamsheet], Pitch]:
    """Parses an open StatsPerform CSV file and extracts position data and possession
    codes as well as teamsheets and pitch information.

    Openly published StatsPerform position data (e.g. for the Pro Forum '22) is stored
    in a CSV file containing all position data (for both halves) as well as information
    about players, the pitch, and ball possession. This function provides high-level
    access to StatsPerform data by parsing the CSV file.

    Parameters
    ----------
    filepath_position: str or pathlib.Path
        Full path to the CSV file.
    teamsheet_home: Teamsheet, optional
        Teamsheet-object for the home team used to create link dictionaries of the
        form `links[team][jID] = xID`. The links are used to map players to a specific
        xID in the respective XY objects. Should be supplied for custom ordering. If
        given as None (default), teamsheet is extracted from the open StatsPerform CSV
        file and its xIDs are assigned in order of appearance.
    teamsheet_away: Teamsheet, optional
        Teamsheet-object for the away team. If given as None (default), teamsheet is
        extracted from the open StatsPerform CSV file. See teamsheet_home for details.

    Returns
    -------
    data_objects: Tuple[Dict[int, Dict[str, XY]], Dict[int, Code], \
            Dict[str, Teamsheet], Pitch]
        Tuple of (nested) floodlight core objects with shape (xy_objects,
        possession_objects, teamsheets, pitch).

        ``xy_objects`` is a nested dictionary containing ``XY`` objects for each team
        and segment of the form ``xy_objects[segment][team] = XY``. For a typical
        league match with two halves and teams, this dictionary looks like:
        ``{0: {'Home': XY, 'Away': XY}, 1: {'Home': XY, 'Away': XY}}``.

        ``possession_objects`` is a dictionary containing ``Code`` objects with
        possession information (home or away) for each segment of the form
        ``possession_objects[segment] = Code``.

        ``teamsheets`` is a dictionary containing ``Teamsheet`` objects for each team
        of the form ``teamsheets[team] = Teamsheet``.

        ``pitch`` is a ``Pitch`` object corresponding to the data.
    """
    # parse the CSV file into pd.DataFrame
    dat_df = pd.read_csv(str(filepath_position))

    # initialize team and ball ids
    team_ids = {"Home": 1.0, "Away": 2.0}
    ball_id = 4

    # check for additional tIDs
    for ID in dat_df["team_id"].unique():
        if not (ID in team_ids.values() or ID == ball_id):
            warnings.warn(f"Team ID {ID} did not match any of the standard IDs!")

    # create or check teamsheet objects
    if teamsheet_home is None and teamsheet_away is None:
        teamsheets = read_teamsheets_from_open_data_csv(filepath_position)
        teamsheet_home = teamsheets["Home"]
        teamsheet_away = teamsheets["Away"]
    elif teamsheet_home is None:
        teamsheets = read_teamsheets_from_open_data_csv(filepath_position)
        teamsheet_home = teamsheets["Home"]
    elif teamsheet_away is None:
        teamsheets = read_teamsheets_from_open_data_csv(filepath_position)
        teamsheet_away = teamsheets["Away"]
    else:
        pass
        # potential check

    # create links
    if "xID" not in teamsheet_home.teamsheet.columns:
        teamsheet_home.add_xIDs()
    if "xID" not in teamsheet_away.teamsheet.columns:
        teamsheet_away.add_xIDs()
    links_jID_to_xID = {}
    links_jID_to_xID["Home"] = teamsheet_home.get_links("jID", "xID")
    links_jID_to_xID["Away"] = teamsheet_away.get_links("jID", "xID")

    # create periods and pitch
    periods, pitch = _create_metadata_from_open_csv_df(dat_df)
    segments = list(periods.keys())

    # infer data shapes
    number_of_players = {
        team: len(links_jID_to_xID[team]) for team in links_jID_to_xID
    }
    number_of_frames = {}
    for segment in segments:
        start = periods[segment][0]
        end = periods[segment][1]
        number_of_frames[segment] = end - start + 1

    # bins
    codes = {"possession": {segment: [] for segment in segments}}
    xydata = {
        "Home": {
            segment: np.full(
                [
                    number_of_frames[segment],
                    number_of_players[list(links_jID_to_xID.keys())[0]] * 2,
                ],
                np.nan,
            )
            for segment in periods
        },
        "Away": {
            segment: np.full(
                [
                    number_of_frames[segment],
                    number_of_players[list(links_jID_to_xID.keys())[1]] * 2,
                ],
                np.nan,
            )
            for segment in periods
        },
        "Ball": {
            segment: np.full([number_of_frames[segment], 2], np.nan)
            for segment in periods
        },
    }

    # loop
    for segment in segments:

        # teams
        for team in team_ids:
            team_df = dat_df[dat_df["team_id"] == team_ids[team]]
            for pID in team_df["player_id"].unique():
                # extract player information
                pl_df = team_df[team_df["player_id"] == pID]
                frames = pl_df["frame_count"].values
                x_position = pl_df["pos_x"].values
                y_position = pl_df["pos_y"].values

                # compute appearance of player in segment
                appearance = np.array(
                    [
                        (periods[segment][0] <= frame <= periods[segment][-1])
                        for frame in frames
                    ]
                )
                # check for players that did not play in segment
                if not np.sum(appearance):
                    continue

                # insert player position to bin array
                jrsy = int(pl_df["jersey_no"].values[0])
                x_col = (links_jID_to_xID[team][jrsy] - 1) * 2
                y_col = (links_jID_to_xID[team][jrsy] - 1) * 2 + 1
                start = frames[appearance][0] - periods[segment][0]
                end = frames[appearance][-1] - periods[segment][0] + 1
                xydata[team][segment][start:end, x_col] = x_position[appearance]
                xydata[team][segment][start:end, y_col] = y_position[appearance]

        # ball
        ball_df = dat_df[dat_df["team_id"] == 4]
        frames = ball_df["frame_count"].values
        appearance = np.array(
            [(periods[segment][0] <= frame <= periods[segment][-1]) for frame in frames]
        )
        xydata["Ball"][segment][:, 0] = ball_df["pos_x"].values[appearance]
        xydata["Ball"][segment][:, 1] = ball_df["pos_y"].values[appearance]

        # update codes
        codes["possession"][segment] = ball_df["possession"].values[appearance]

    # create objects
    xy_objects = {}
    possession_objects = {}
    for segment in segments:
        xy_objects[segment] = {}
        possession_objects[segment] = Code(
            code=codes["possession"][segment],
            name="possession",
            definitions=dict([(team_id, team) for team, team_id in team_ids.items()]),
            framerate=10,
        )
        for team in ["Home", "Away", "Ball"]:
            xy_objects[segment][team] = XY(
                xy=xydata[team][segment],
                framerate=10,
            )
    teamsheets = {
        "Home": teamsheet_home,
        "Away": teamsheet_away,
    }

    # pack objects
    data_objects = (
        xy_objects,
        possession_objects,
        teamsheets,
        pitch,
    )

    return data_objects
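

# Illustrative usage sketch added for documentation purposes; not part of the original
# module. The file path is a placeholder; segments of the open position data are keyed
# by the integers 0 and 1 (see the period detection above).
def _example_open_position_data():
    xy_objects, possession_objects, teamsheets, pitch = read_open_position_data_csv(
        "open_positions.csv"
    )
    # raw (N x 2) ball coordinates of the first segment
    ball_xy = xy_objects[0]["Ball"].xy
    # possession code per frame (1.0 = Home, 2.0 = Away, see ``definitions``)
    possession = possession_objects[0].code
    return ball_xy.shape, possession.shape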


# ----------------------------- StatsPerform Format ---------------------------


def _read_position_data_txt_single_line(
    line: str,
) -> Tuple[
    int,
    int,
    Dict[str, Dict[str, Tuple[float, float, float]]],
    Dict[str, Union[str, tuple]],
]:
    """Extracts all relevant information from a single line of StatsPerform's position
    data TXT file (i.e. one frame of data).

    Parameters
    ----------
    line: str
        One full line from StatsPerform's .txt-file, equals one sample of data.

    Returns
    -------
    gameclock: int
        The gameclock of the current segment in milliseconds.
    segment: int
        The segment identifier.
    positions: Dict[str, Dict[str, Tuple[float, float, float]]]
        Nested dictionary that stores player position information for each team and
        player. Has the form ``positions[team][jID] = (x, y)``.
    ball: Dict[str]
        Dictionary with ball information. Has keys 'position', 'possession' and
        'ballstatus'.
    """
    # bins
    positions = {"Home": {}, "Away": {}, "Other": {}}
    ball = {}

    # read chunks
    chunks = line.split(":")
    time_chunk = chunks[0]
    player_chunks = chunks[1].split(";")

    ball_chunk = None
    if len(chunks) > 2:  # check if ball information exist in chunk
        ball_chunk = chunks[2]

    # time chunk
    # systemclock = time_chunk.split(";")[0]
    # possible check or synchronization step
    timeinfo = time_chunk.split(";")[1].split(",")
    gameclock = int(timeinfo[0])
    segment = int(timeinfo[1])
    # ballstatus = timeinfo[2].split(":")[0] == '0'  # '0' seems to be always the case?

    # player chunks
    for player_chunk in player_chunks:

        # skip final entry of chunk
        if not player_chunk or player_chunk == "\n":
            continue

        # read team
        chunk_data = player_chunk.split(",")
        if chunk_data[0] in ["0", "3"]:
            team = "Home"
        elif chunk_data[0] in ["1", "4"]:
            team = "Away"
        else:
            team = "Other"

        # read IDs
        # pID = chunk_data[1]
        jID = chunk_data[2]

        # read positions
        x, y = map(lambda x: float(x), chunk_data[3:])

        # assign
        positions[team][jID] = (x, y)

    # ball chunk
    if ball_chunk is not None:
        x, y, z = map(lambda x: float(x), ball_chunk.split(";")[0].split(","))
        # ball["position"] = (x, y, z)  # z-coordinate is not yet supported
        ball["position"] = (x, y)

    return gameclock, segment, positions, ball


def _read_time_information_from_position_data_txt(
    filepath_position: Union[str, Path],
) -> Tuple[Dict, Union[int, None]]:
    """Reads StatsPerform's position TXT file and extracts information about the first
    and last frame of periods. Also, a framerate is estimated from the gameclock
    difference between samples.

    Parameters
    ----------
    filepath_position: str or pathlib.Path
        Full path to the TXT file containing the position data.

    Returns
    -------
    periods: Dict
        Dictionary with start and endframes:
        ``periods[segment] = [startframe, endframe]``.
    framerate_est: int or None
        Estimated temporal resolution of data in frames per second/Hertz.
    """
    # bins
    startframes = {}
    endframes = {}
    framerate_est = None

    # read TXT file from disk
    file_txt = open(filepath_position, "r")

    # loop
    last_gameclock = None
    last_segment = None
    for line in file_txt.readlines():

        # read gameclock and segment
        gameclock, segment, _, _ = _read_position_data_txt_single_line(line)

        # update periods
        if segment not in startframes:
            startframes[segment] = gameclock
            if last_gameclock is not None:
                endframes[last_segment] = last_gameclock

        # estimate framerate if desired
        if last_gameclock is not None:
            delta = np.absolute(gameclock - last_gameclock)  # in milliseconds
            if framerate_est is None:
                framerate_est = int(1000 / delta)
            elif framerate_est != int(1000 / delta) and last_segment == segment:
                warnings.warn(
                    f"Framerate estimation yielded diverging results. "
                    f"The originally estimated framerate of {framerate_est} Hz did not "
                    f"match the current estimation of {int(1000 / delta)} Hz. This "
                    f"might be caused by missing frame(s) in the position data. "
                    f"Continuing by choosing the latest estimation of "
                    f"{int(1000 / delta)} Hz"
                )
                framerate_est = int(1000 / delta)

        # update variables
        last_gameclock = gameclock
        last_segment = segment

    # update end of final segment
    endframes[last_segment] = last_gameclock

    # assembly
    periods = {
        segment: (startframes[segment], endframes[segment]) for segment in startframes
    }

    # close file
    file_txt.close()

    return periods, framerate_est


def _read_jersey_numbers_from_position_data_txt(
    file_location_txt: Union[str, Path],
) -> Tuple[set, set]:
    """Reads StatsPerform's position TXT file and extracts unique set of jIDs
    (jerseynumbers) for both teams.

    Parameters
    ----------
    file_location_txt: str or pathlib.Path
        Full path to the TXT file containing the position data.

    Returns
    -------
    home_jIDs: set
    away_jIDs: set
    """
    # bins
    home_jIDs = set()
    away_jIDs = set()

    # read TXT file from disk
    file_txt = open(file_location_txt, "r")

    # loop
    for package in file_txt.readlines():

        # read line
        _, _, positions, _ = _read_position_data_txt_single_line(package)

        # extract jersey numbers
        home_jIDs |= set(positions["Home"].keys())
        away_jIDs |= set(positions["Away"].keys())

    # close file
    file_txt.close()

    return home_jIDs, away_jIDs


def read_teamsheets_from_event_data_xml(
    filepath_events: Union[str, Path],
) -> Dict[str, Teamsheet]:
    """Parses the StatsPerform event file and returns two Teamsheet-objects with
    detailed player information for the home and the away team.

    Parameters
    ----------
    filepath_events: str or pathlib.Path
        Full path to the XML file containing the event data.

    Returns
    -------
    teamsheets: Dict[str, Teamsheet]
        Dictionary with teamsheets for the home team and the away team.
    """
    # load event data xml tree into memory
    tree = etree.parse(str(filepath_events))
    root = tree.getroot()

    # initialize teamsheets
    teamsheets = {
        "Home": pd.DataFrame(
            columns=["player", "position", "team_name", "jID", "pID", "tID", "started"]
        ),
        "Away": pd.DataFrame(
            columns=["player", "position", "team_name", "jID", "pID", "tID", "started"]
        ),
    }

    # parse player information
    for team_matchsheet in root.findall("MatchSheet/Team"):

        # skip referees
        if team_matchsheet.attrib["Type"] == "Referees":
            continue

        # read team
        team = team_matchsheet.attrib["Type"][:-4]  # cut 'Team' off e.g. 'HomeTeam'
        tID = team_matchsheet.attrib["IdTeam"]
        team_name = team_matchsheet.attrib["Name"]

        # find players
        players = [
            actor
            for actor in team_matchsheet.findall("Actor")
            if actor.attrib["Occupation"] == "Player"
        ]

        # create teamsheet
        teamsheets[team]["player"] = [
            get_and_convert(player, "NickName", str) for player in players
        ]
        teamsheets[team]["pID"] = [
            get_and_convert(player, "IdActor", int) for player in players
        ]
        teamsheets[team]["jID"] = [
            get_and_convert(player, "JerseyNumber", int) for player in players
        ]
        teamsheets[team]["position"] = [
            get_and_convert(player, "Position", str) for player in players
        ]
        teamsheets[team]["started"] = [
            player.get("IsStarter") == "True" for player in players
        ]
        teamsheets[team]["tID"] = tID
        teamsheets[team]["team_name"] = team_name

    # create teamsheet objects
    for team in teamsheets:
        teamsheets[team] = Teamsheet(teamsheets[team])

    return teamsheets


def read_teamsheets_from_position_data_txt(
    filepath_position: Union[str, Path],
) -> Dict[str, Teamsheet]:
    """Parses the StatsPerform position file and returns two simple Teamsheet-objects
    containing only two columns "player" and "jID" for the home and the away team.

    Parameters
    ----------
    filepath_position: str or pathlib.Path
        Full path to the TXT file containing the position data.

    Returns
    -------
    teamsheets: Dict[str, Teamsheet]
        Dictionary with teamsheets for the home team and the away team.
    """
    # create list of jIDs
    homejrsy, awayjrsy = _read_jersey_numbers_from_position_data_txt(filepath_position)
    homejrsy = list(homejrsy)
    awayjrsy = list(awayjrsy)
    homejrsy.sort()
    awayjrsy.sort()
    jIDs = {
        "Home": homejrsy,
        "Away": awayjrsy,
    }

    # create teamsheets
    teamsheets = {
        "Home": pd.DataFrame(columns=["player", "jID"]),
        "Away": pd.DataFrame(columns=["player", "jID"]),
    }
    for team in teamsheets:
        teamsheets[team]["player"] = [f"Player {i}" for i in range(len(jIDs[team]))]
        teamsheets[team]["jID"] = [int(jID) for jID in jIDs[team]]

    # create teamsheet objects
    for team in teamsheets:
        teamsheets[team] = Teamsheet(teamsheets[team])

    return teamsheets


def read_event_data_xml(
    filepath_events: Union[str, Path],
    teamsheet_home: Teamsheet = None,
    teamsheet_away: Teamsheet = None,
) -> Tuple[Dict[str, Dict[str, Events]], Dict[str, Teamsheet], Pitch]:
    """Parses a StatsPerform XML file and extracts event data and pitch information.

    This function provides high-level access to the StatsPerform match events XML file
    and returns Events objects for both teams and information about the pitch.

    Parameters
    ----------
    filepath_events: str or pathlib.Path
        Full path to the XML file containing the event data.
    teamsheet_home: Teamsheet, optional
        Teamsheet-object for the home team used to create link dictionaries of the
        form `links[pID] = team`. The links are used to map players to the home and
        away teams. If given as None (default), teamsheet is extracted from the event
        data XML file.
    teamsheet_away: Teamsheet, optional
        Teamsheet-object for the away team. If given as None (default), teamsheet is
        extracted from the event data XML file. See teamsheet_home for details.

    Returns
    -------
    data_objects: Tuple[Dict[str, Dict[str, Events]], Dict[str, Teamsheet], Pitch]
        Tuple of (nested) floodlight core objects with shape (events_objects,
        teamsheets, pitch).

        ``events_objects`` is a nested dictionary containing ``Events`` objects for
        each team and segment of the form ``events_objects[segment][team] = Events``.
        For a typical league match with two halves and teams, this dictionary looks
        like: ``{'HT1': {'Home': Events, 'Away': Events}, 'HT2': {'Home': Events,
        'Away': Events}}``.

        ``teamsheets`` is a dictionary containing ``Teamsheet`` objects for each team
        of the form ``teamsheets[team] = Teamsheet``.

        ``pitch`` is a ``Pitch`` object corresponding to the data.
    """
    # load xml tree into memory
    tree = etree.parse(str(filepath_events))
    root = tree.getroot()

    # create bins, read segments, and assign teams
    columns = [
        "eID",
        "gameclock",
        "pID",
        "minute",
        "second",
        "at_x",
        "at_y",
        "to_x",
        "to_y",
        "qualifier",
    ]
    segments = [
        f"HT{get_and_convert(period.attrib, 'IdHalf', str)}"
        for period in root.findall("Events/EventsHalf")
    ]
    teams = ["Home", "Away"]

    # create or check teamsheet objects
    if teamsheet_home is None and teamsheet_away is None:
        teamsheets = read_teamsheets_from_event_data_xml(filepath_events)
        teamsheet_home = teamsheets["Home"]
        teamsheet_away = teamsheets["Away"]
    elif teamsheet_home is None:
        teamsheets = read_teamsheets_from_event_data_xml(filepath_events)
        teamsheet_home = teamsheets["Home"]
    elif teamsheet_away is None:
        teamsheets = read_teamsheets_from_event_data_xml(filepath_events)
        teamsheet_away = teamsheets["Away"]
    else:
        pass
        # potential check

    # create links between pIDs and team
    links_pID_to_team = {}
    links_pID_to_team.update({pID: "Home" for pID in teamsheet_home["pID"]})
    links_pID_to_team.update({pID: "Away" for pID in teamsheet_away["pID"]})

    # bins
    event_lists = {
        team: {segment: {col: [] for col in columns} for segment in segments}
        for team in teams
    }

    # loop over events
    for half in root.findall("Events/EventsHalf"):
        # get segment information
        period = get_and_convert(half.attrib, "IdHalf", str)
        segment = "HT" + str(period)
        for event in half.findall("Event"):
            # read pID
            pID = get_and_convert(event.attrib, "IdActor1", int)

            # assign team
            team = get_and_convert(links_pID_to_team, pID, str)

            # create list of either a single team or both teams if no clear assignment
            if team == "None":
                teams_assigned = teams  # add to both teams
            else:
                teams_assigned = [team]  # only add to one team

            # identifier
            eID = get_and_convert(event.attrib, "EventName", str)
            for team in teams_assigned:
                event_lists[team][segment]["eID"].append(eID)
                event_lists[team][segment]["pID"].append(pID)

            # relative time
            gameclock = get_and_convert(event.attrib, "Time", int) / 1000
            minute = np.floor(gameclock / 60)
            second = np.floor(gameclock - minute * 60)
            for team in teams_assigned:
                event_lists[team][segment]["gameclock"].append(gameclock)
                event_lists[team][segment]["minute"].append(minute)
                event_lists[team][segment]["second"].append(second)

            # location
            at_x = get_and_convert(event.attrib, "LocationX", float)
            at_y = get_and_convert(event.attrib, "LocationY", float)
            to_x = get_and_convert(event.attrib, "TargetX", float)
            to_y = get_and_convert(event.attrib, "TargetY", float)
            for team in teams_assigned:
                event_lists[team][segment]["at_x"].append(at_x)
                event_lists[team][segment]["at_y"].append(at_y)
                event_lists[team][segment]["to_x"].append(to_x)
                event_lists[team][segment]["to_y"].append(to_y)

            # qualifier
            qual_dict = {}
            for qual_id in event.attrib:
                qual_value = event.attrib.get(qual_id)
                qual_dict[qual_id] = qual_value
            for team in teams_assigned:
                event_lists[team][segment]["qualifier"].append(str(qual_dict))

    # create pitch
    length = get_and_convert(root.attrib, "FieldLength", int) / 100
    width = get_and_convert(root.attrib, "FieldWidth", int) / 100
    pitch = Pitch.from_template(
        "statsperform_event",
        length=length,
        width=width,
        sport="football",
    )

    # create objects
    events_objects = {}
    for segment in segments:
        events_objects[segment] = {}
        for team in ["Home", "Away"]:
            events_objects[segment][team] = Events(
                events=pd.DataFrame(data=event_lists[team][segment]),
            )
    teamsheets = {
        "Home": teamsheet_home,
        "Away": teamsheet_away,
    }

    # pack objects
    data_objects = (events_objects, teamsheets, pitch)

    return data_objects
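

# Illustrative usage sketch added for documentation purposes; not part of the original
# module. The file path is a placeholder. Since the qualifier column stores
# ``str(qual_dict)``, one way to recover the dict is ast.literal_eval; this is an
# assumption about downstream handling, not something prescribed by the parser.
def _example_event_data_xml():
    import ast

    events_objects, teamsheets, pitch = read_event_data_xml("match_events.xml")
    home_ht1 = events_objects["HT1"]["Home"].events
    # recover the raw XML attributes of the first event as a dict
    first_qualifier = ast.literal_eval(home_ht1["qualifier"].iloc[0])
    return first_qualifier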


def read_position_data_txt(
    filepath_position: Union[str, Path],
    teamsheet_home: Teamsheet = None,
    teamsheet_away: Teamsheet = None,
) -> Tuple[Dict[int, Dict[str, XY]], Dict[int, Teamsheet]]:
    """Parses a StatsPerform TXT file and extracts position data and teamsheets.

    Internal StatsPerform position data is stored as a TXT file containing all position
    data (for both halves). This function provides high-level access to StatsPerform
    data by parsing the TXT file. Since no information about the framerate is delivered
    in the data itself, it is estimated from the time difference between individual
    frames. Teamsheets can be supplied via the ``teamsheet_home`` and
    ``teamsheet_away`` arguments; otherwise, minimal Teamsheet-objects are inferred
    from the position data.

    Parameters
    ----------
    filepath_position: str or pathlib.Path
        Full path to the TXT file containing the position data.
    teamsheet_home: Teamsheet, optional
        Teamsheet-object for the home team used to create link dictionaries of the
        form `links[team][jID] = xID`. The links are used to map players to a specific
        xID in the respective XY objects. Should be supplied for custom ordering. If
        given as None (default), teamsheet is extracted from the position data TXT
        file and its xIDs are assigned in order of appearance.
    teamsheet_away: Teamsheet, optional
        Teamsheet-object for the away team. If given as None (default), teamsheet is
        extracted from the position data TXT file. See teamsheet_home for details.

    Returns
    -------
    data_objects: Tuple[Dict[int, Dict[str, XY]], Dict[int, Teamsheet]]
        Tuple of (nested) floodlight core objects with shape (xy_objects, teamsheets).

        ``xy_objects`` is a nested dictionary containing ``XY`` objects for each team
        and segment of the form ``xy_objects[segment][team] = XY``. For a typical
        league match with two halves and teams, this dictionary looks like:
        ``{1: {'Home': XY, 'Away': XY}, 2: {'Home': XY, 'Away': XY}}``.

        ``teamsheets`` is a dictionary containing ``Teamsheet`` objects for each team
        of the form ``teamsheets[team] = Teamsheet``.

    Notes
    -----
    StatsPerform position data does not contain any player information except jersey
    numbers by default. Thus, the teamsheet objects generated by this method will name
    players 'Player i' with i starting at 1. To identify players, use the jersey
    numbers of players or provide custom teamsheets (e.g. by parsing teamsheets from
    the StatsPerform event data or another data provider).
    """
    # parse TXT file for periods and estimate framerate if not contained in filepath
    periods, framerate_est = _read_time_information_from_position_data_txt(
        filepath_position
    )
    segments = list(periods.keys())

    # create or check teamsheet objects
    if teamsheet_home is None and teamsheet_away is None:
        teamsheets = read_teamsheets_from_position_data_txt(filepath_position)
        teamsheet_home = teamsheets["Home"]
        teamsheet_away = teamsheets["Away"]
    elif teamsheet_home is None:
        teamsheets = read_teamsheets_from_position_data_txt(filepath_position)
        teamsheet_home = teamsheets["Home"]
    elif teamsheet_away is None:
        teamsheets = read_teamsheets_from_position_data_txt(filepath_position)
        teamsheet_away = teamsheets["Away"]
    else:
        pass
        # potential check

    # create links
    if "xID" not in teamsheet_home.teamsheet.columns:
        teamsheet_home.add_xIDs()
    if "xID" not in teamsheet_away.teamsheet.columns:
        teamsheet_away.add_xIDs()
    links_jID_to_xID = {}
    links_jID_to_xID["Home"] = teamsheet_home.get_links("jID", "xID")
    links_jID_to_xID["Away"] = teamsheet_away.get_links("jID", "xID")

    # infer data array shapes
    number_of_home_players = max(links_jID_to_xID["Home"].values()) + 1
    number_of_away_players = max(links_jID_to_xID["Away"].values()) + 1
    number_of_frames = {}
    for segment in segments:
        number_of_frames[segment] = (
            int((periods[segment][1] - periods[segment][0]) / 1000 * framerate_est) + 1
        )

    # bins
    xydata = {}
    xydata["Home"] = {
        segment: np.full(
            [number_of_frames[segment], number_of_home_players * 2], np.nan
        )
        for segment in segments
    }
    xydata["Away"] = {
        segment: np.full(
            [number_of_frames[segment], number_of_away_players * 2], np.nan
        )
        for segment in segments
    }
    xydata["Ball"] = {
        segment: np.full([number_of_frames[segment], 2], np.nan)
        for segment in segments
    }

    # read TXT file from disk
    with open(filepath_position, "r") as f:
        tracking_data_lines = f.readlines()

    # loop
    for package in tracking_data_lines:

        # read line to get gameclock, player positions and ball info
        (
            gameclock,
            segment,
            positions,
            ball,
        ) = _read_position_data_txt_single_line(package)

        # check if frame is in any segment
        if segment is None:
            # skip line if not
            continue
        else:
            # otherwise calculate relative frame (in respective segment)
            frame_rel = int((gameclock - periods[segment][0]) / 1000 * framerate_est)

        # insert (x,y)-data into np.array
        for team in ["Home", "Away"]:
            for jID in positions[team].keys():
                # map jersey number to array index and infer respective columns
                x_col = (links_jID_to_xID[team][int(jID)] - 1) * 2
                y_col = (links_jID_to_xID[team][int(jID)] - 1) * 2 + 1
                xydata[team][segment][frame_rel, x_col] = positions[team][jID][0]
                xydata[team][segment][frame_rel, y_col] = positions[team][jID][1]

        # get ball data
        xydata["Ball"][segment][frame_rel] = ball.get("position", np.nan)

    # create objects
    xy_objects = {}
    for segment in segments:
        xy_objects[segment] = {}
        for team in ["Home", "Away", "Ball"]:
            xy_objects[segment][team] = XY(
                xy=xydata[team][segment],
                framerate=framerate_est,
            )
    teamsheets = {
        "Home": teamsheet_home,
        "Away": teamsheet_away,
    }

    # pack objects
    data_objects = (
        xy_objects,
        teamsheets,
    )

    return data_objects


def read_event_data_from_url(
    url: str,
    teamsheet_home: Teamsheet = None,
    teamsheet_away: Teamsheet = None,
) -> Tuple[Dict[str, Dict[str, Events]], Dict[str, Teamsheet], Pitch]:
    """Reads a URL containing a StatsPerform events XML file and extracts the stored
    event data, pitch information, and teamsheets.

    The event data from the URL is downloaded into a temporary file stored in the
    repository's internal root ``.data``-folder and removed afterwards.

    Parameters
    ----------
    url: str
        URL to the XML file containing the event data.
    teamsheet_home: Teamsheet, optional
        Teamsheet-object for the home team used to create link dictionaries of the
        form `links[pID] = team`. The links are used to map players to the home and
        away teams. If given as None (default), teamsheet is extracted from the event
        data XML file.
    teamsheet_away: Teamsheet, optional
        Teamsheet-object for the away team. If given as None (default), teamsheet is
        extracted from the event data XML file. See teamsheet_home for details.

    Returns
    -------
    data_objects: Tuple[Dict[str, Dict[str, Events]], Dict[str, Teamsheet], Pitch]
        Tuple of (nested) floodlight core objects with shape (events_objects,
        teamsheets, pitch).

        ``events_objects`` is a nested dictionary containing ``Events`` objects for
        each team and segment of the form ``events_objects[segment][team] = Events``.
        For a typical league match with two halves and teams, this dictionary looks
        like: ``{'HT1': {'Home': Events, 'Away': Events}, 'HT2': {'Home': Events,
        'Away': Events}}``.

        ``teamsheets`` is a dictionary containing ``Teamsheet`` objects for each team
        of the form ``teamsheets[team] = Teamsheet``.

        ``pitch`` is a ``Pitch`` object corresponding to the data.
    """
    data_dir = os.path.join(DATA_DIR, "statsperform")
    if not os.path.isdir(data_dir):
        os.makedirs(data_dir, exist_ok=True)
    temp_file = os.path.join(data_dir, "events_temp.xml")
    with open(temp_file, "wb") as binary_file:
        binary_file.write(download_from_url(url))
    events_objects, teamsheets, pitch = read_event_data_xml(
        filepath_events=os.path.join(data_dir, temp_file),
        teamsheet_home=teamsheet_home,
        teamsheet_away=teamsheet_away,
    )
    data_objects = (events_objects, teamsheets, pitch)
    os.remove(os.path.join(data_dir, temp_file))

    return data_objects


def read_position_data_from_url(
    url: str,
    teamsheet_home: Teamsheet = None,
    teamsheet_away: Teamsheet = None,
) -> Tuple[Dict[int, Dict[str, XY]], Dict[int, Teamsheet]]:
    """Reads a URL from the StatsPerform API (StatsEdgeViewer) containing a position
    data TXT file and extracts position data and teamsheets.

    The position data from the URL is downloaded into a temporary file stored in the
    repository's internal root ``.data``-folder and removed afterwards.

    Parameters
    ----------
    url: str or pathlib.Path
        URL to the TXT file containing the position data.
    teamsheet_home: Teamsheet, optional
        Teamsheet-object for the home team used to create link dictionaries of the
        form `links[team][jID] = xID`. The links are used to map players to a specific
        xID in the respective XY objects. Should be supplied for custom ordering. If
        given as None (default), teamsheet is extracted from the position data TXT
        file and its xIDs are assigned in order of appearance.
    teamsheet_away: Teamsheet, optional
        Teamsheet-object for the away team. If given as None (default), teamsheet is
        extracted from the position data TXT file. See teamsheet_home for details.

    Returns
    -------
    data_objects: Tuple[Dict[int, Dict[str, XY]], Dict[int, Teamsheet]]
        Tuple of (nested) floodlight core objects with shape (xy_objects, teamsheets).

        ``xy_objects`` is a nested dictionary containing ``XY`` objects for each team
        and segment of the form ``xy_objects[segment][team] = XY``. For a typical
        league match with two halves and teams, this dictionary looks like:
        ``{1: {'Home': XY, 'Away': XY}, 2: {'Home': XY, 'Away': XY}}``.

        ``teamsheets`` is a dictionary containing ``Teamsheet`` objects for each team
        of the form ``teamsheets[team] = Teamsheet``.

    Notes
    -----
    StatsPerform position data does not contain any player information except jersey
    numbers by default. Thus, the teamsheet objects generated by this method will name
    players 'Player i' with i starting at 1. To identify players, use the jersey
    numbers of players or provide custom teamsheets (e.g. by parsing teamsheets from
    the StatsPerform event data or another data provider).
    """
    data_dir = os.path.join(DATA_DIR, "statsperform")
    if not os.path.isdir(data_dir):
        os.makedirs(data_dir, exist_ok=True)
    temp_file = os.path.join(data_dir, "tracking_temp.txt")
    with open(temp_file, "wb") as binary_file:
        binary_file.write(download_from_url(url))
    xy_objects, teamsheets = read_position_data_txt(
        filepath_position=os.path.join(data_dir, temp_file),
        teamsheet_home=teamsheet_home,
        teamsheet_away=teamsheet_away,
    )
    data_objects = (xy_objects, teamsheets)
    os.remove(os.path.join(data_dir, temp_file))

    return data_objects
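

# Illustrative usage sketch added for documentation purposes; not part of the original
# module. The URL is a placeholder for a StatsPerform API (StatsEdgeViewer) link that
# serves the position data TXT file.
def _example_position_data_from_url():
    url = "https://example.com/path/to/tracking.txt"  # placeholder
    xy_objects, teamsheets = read_position_data_from_url(url)
    # e.g. inspect the framerate estimated from the downloaded data
    first_segment = list(xy_objects.keys())[0]
    return xy_objects[first_segment]["Home"].framerate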