Source code for floodlight.io.tracab

import json
from pathlib import Path
from typing import Dict, Tuple, Union

import numpy as np
import pandas as pd
from lxml import etree

from floodlight.core.code import Code
from floodlight.core.pitch import Pitch
from floodlight.core.xy import XY
from floodlight.core.teamsheet import Teamsheet
from floodlight.io.utils import get_and_convert


def _read_metadata_from_xml(
    filepath_metadata: Union[str, Path]
) -> Tuple[Dict, Dict, Pitch]:
    """Reads TRACAB's metadata file (xml format) and extracts match meta information
    such as framerate, periods and pitch.

    Parameters
    ----------
    filepath_metadata: str or pathlib.Path
        Full path to _metadata.xml file.

    Returns
    -------
    metainfo: Dict
        Dictionary with metainformation such as framerate.
    periods: Dict
        Dictionary with start and endframes:
        `periods[segment] = (startframe, endframe)`.
    pitch: Pitch
        Pitch object with actual pitch length and width.
    """
    #  set up XML tree
    tree = etree.parse(str(filepath_metadata))
    root = tree.getroot()

    # parse XML file, extract matchinfo and period start/endframes
    metadata = {}
    periods = {}
    attributes = root.find("match").attrib

    framerate = attributes.get("iFrameRateFps")
    metadata["framerate"] = int(framerate) if framerate else None

    length = attributes.get("fPitchXSizeMeters")
    metadata["length"] = float(length) if length else None

    width = attributes.get("fPitchYSizeMeters")
    metadata["width"] = float(width) if width else None

    for elem in root.findall("match/period"):
        if elem.attrib["iEndFrame"] != "0":
            segment = "HT" + elem.attrib["iId"]
            start = int(elem.attrib["iStartFrame"])
            end = int(elem.attrib["iEndFrame"])
            periods[segment] = (start, end)

    pitch = Pitch.from_template(
        "tracab",
        length=float(metadata["length"]),
        width=float(metadata["width"]),
        sport="football",
    )

    return metadata, periods, pitch
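
# Illustrative sketch (not part of the library): a minimal metadata XML as assumed by
# `_read_metadata_from_xml`. Attribute and element names follow the parsing code above;
# the root element name and all values are made up.
#
#   <root>
#     <match iFrameRateFps="25" fPitchXSizeMeters="105.0" fPitchYSizeMeters="68.0">
#       <period iId="1" iStartFrame="100000" iEndFrame="170000"/>
#       <period iId="2" iStartFrame="200000" iEndFrame="270000"/>
#       <period iId="3" iStartFrame="0" iEndFrame="0"/>
#     </match>
#   </root>
#
# This would yield metadata = {"framerate": 25, "length": 105.0, "width": 68.0} and
# periods = {"HT1": (100000, 170000), "HT2": (200000, 270000)}; periods with an end
# frame of "0" are skipped.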


def _read_metadata_from_json(
    filepath_metadata: Union[str, Path]
) -> Tuple[Dict, Dict, Pitch]:
    """Reads TRACAB's metadata file (json format) and extracts match meta information
    such as framerate, periods and pitch.

    Parameters
    ----------
    filepath_metadata: str or pathlib.Path
        Full path to _metadata.json file.

    Returns
    -------
    metadata: Dict
        Dictionary with metainformation such as framerate.
    periods: Dict
        Dictionary with start and endframes:
        `periods[segment] = (startframe, endframe)`.
    pitch: Pitch
        Pitch object with actual pitch length and width.
    """
    # load file
    with open(filepath_metadata, "r", encoding="utf8") as f:
        metafile = json.load(f)

    # bins
    metadata = {}
    periods = {}

    # get framerate
    metadata["framerate"] = get_and_convert(metafile, "FrameRate", int)

    # get length and width and convert from cm to m
    length = get_and_convert(metafile, "PitchLongSide", float)
    width = get_and_convert(metafile, "PitchShortSide", float)
    metadata["length"] = length / 100 if length else None
    metadata["width"] = width / 100 if width else None

    # get period start and end frames
    for i in range(1, 6):
        phase = f"Phase{i}"
        ht = f"HT{i}"
        phase_start = get_and_convert(metafile, phase + "StartFrame", int)
        phase_end = get_and_convert(metafile, phase + "EndFrame", int)
        if phase_start is None or phase_end is None:
            continue
        if phase_start == 0 or phase_end == 0:
            continue
        periods[ht] = (phase_start, phase_end)

    # create pitch
    pitch = Pitch.from_template(
        "tracab",
        length=metadata["length"],
        width=metadata["width"],
        sport="football",
    )

    return metadata, periods, pitch
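
# Illustrative sketch (not part of the library): the json metadata keys read by
# `_read_metadata_from_json`. Key names follow the parsing code above; the values are
# made up, and pitch dimensions are given in centimeters before conversion to meters.
#
#   {
#       "FrameRate": 25,
#       "PitchLongSide": 10500,
#       "PitchShortSide": 6800,
#       "Phase1StartFrame": 100000,
#       "Phase1EndFrame": 170000,
#       "Phase2StartFrame": 200000,
#       "Phase2EndFrame": 270000,
#       "Phase3StartFrame": 0,
#       "Phase3EndFrame": 0
#   }
#
# This would yield metadata = {"framerate": 25, "length": 105.0, "width": 68.0} and
# periods = {"HT1": (100000, 170000), "HT2": (200000, 270000)}; phases with a start or
# end frame of 0, or with missing entries, are skipped.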


def _read_dat_single_line(
    package: str,
) -> Tuple[
    int, Dict[str, Dict[str, Tuple[float, float, float]]], Dict[str, Union[str, tuple]]
]:
    """Extracts all relevant information from a single line of TRACAB's .dat file
    (i.e. one frame of data).

    Parameters
    ----------
    package: str
        One full line from TRACAB's .dat-file, equals one "package" according to the
        file-format documentation.

    Returns
    -------
    frame_number: int
        The number of the current frame.
    positions: Dict[str, Dict[str, Tuple[float, float, float]]]
        Nested dictionary that stores player position information for each team and
        player. Has the form `positions[team][jID] = (x, y, speed)`.
    ball: Dict[str, Union[str, tuple]]
        Dictionary with ball information. Has keys 'position', 'possession' and
        'ballstatus'.
    """
    # bins
    positions = {"Home": {}, "Away": {}, "Other": {}}
    ball = {}

    # split package into chunks
    chunk1, chunk2, chunk3, _ = package.split(sep=":")

    # first chunk (frame number)
    frame_number = int(chunk1)

    # second chunk (player positions)
    targets = chunk2[:-1].split(sep=";")
    for t in targets:
        player_data = t.split(sep=",")
        # type conversions
        team, system_id, jID = map(int, player_data[:3])
        x, y, speed = map(float, player_data[3:])
        if team == 1:
            team = "Home"
        elif team == 0:
            team = "Away"
        else:
            team = "Other"
        # assign
        positions[team][jID] = (x, y, speed)

    # third chunk (ball data)
    ball_data = chunk3.split(sep=",")[:6]
    ball["position"] = tuple(map(lambda x: float(x), ball_data[:2]))
    ball["possession"] = ball_data[4]
    ball["ballstatus"] = ball_data[5][0]

    return frame_number, positions, ball
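
# Illustrative sketch (not part of the library): one .dat "package" in the format
# assumed by `_read_dat_single_line`. Chunks are separated by ':', player entries by
# ';', fields by ','; all values below are made up.
#
#   >>> package = "100123:1,15,7,-1234,2345,3.5;0,16,10,1000,-500,2.8;:-50,120,11,5.0,H,A;:"
#   >>> frame_number, positions, ball = _read_dat_single_line(package)
#   >>> frame_number
#   100123
#   >>> positions["Home"][7]
#   (-1234.0, 2345.0, 3.5)
#   >>> positions["Away"][10]
#   (1000.0, -500.0, 2.8)
#   >>> ball
#   {'position': (-50.0, 120.0), 'possession': 'H', 'ballstatus': 'A'}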


def _frame_in_period(
    frame_number: int, periods: Dict[str, Tuple[int, int]]
) -> Union[str, None]:
    """Checks if a given frame is within the range of start- and endframe for all
    periods and returns the name of the period the frame belongs to, or None if it
    can't find any.

    Parameters
    ----------
    frame_number: int
        Frame number to be checked.
    periods: Dict[str, Tuple[int, int]]
        Dictionary with period start- and endframes of the form
        `periods[segment] = (startframe, endframe)` as it is returned by
        :meth:`floodlight.io.tracab._read_metadata_from_xml` or
        :meth:`floodlight.io.tracab._read_metadata_from_json`.

    Returns
    -------
    segment: str or None
        Name of the segment the frame belongs to, or None if it does not belong to any
        of the supplied segments.
    """
    # determine current segment by iterating through all segments (i)
    segment = None
    for i in periods.keys():
        if frame_number in range(periods[i][0], periods[i][1] + 1):
            segment = i

    return segment
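
# Illustrative sketch (not part of the library), with made-up frame ranges:
#
#   >>> periods = {"HT1": (100000, 170000), "HT2": (200000, 270000)}
#   >>> _frame_in_period(150000, periods)
#   'HT1'
#   >>> _frame_in_period(180000, periods) is None
#   True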


def _read_dat_jersey_numbers(filepath_dat: Union[str, Path]):
    """Reads entire TRACAB .dat file and extracts unique set of jIDs (jerseynumbers)
    for both teams.

    Parameters
    ----------
    filepath_dat: str or pathlib.Path
        Full path to .dat file.

    Returns
    -------
    home_jIDs: set
    away_jIDs: set
    """
    # bins
    home_jIDs = set()
    away_jIDs = set()
    # loop
    with open(str(filepath_dat), "r") as f:
        while True:
            package = f.readline()
            # terminate if at end of file
            if len(package) == 0:
                break
            # read line
            _, positions, _ = _read_dat_single_line(package)
            # Extract jersey numbers
            home_jIDs |= positions["Home"].keys()
            away_jIDs |= positions["Away"].keys()

    return home_jIDs, away_jIDs


def read_teamsheets_from_dat(
    filepath_dat: Union[str, Path]
) -> Dict[str, Teamsheet]:
    """Parses the entire TRACAB .dat file for unique jIDs (jerseynumbers) and creates
    respective teamsheets for the home and the away team.

    Parameters
    ----------
    filepath_dat: str or pathlib.Path
        Full path to .dat file.

    Returns
    -------
    teamsheets: Dict[str, Teamsheet]
        Dictionary with teamsheets for the home team and the away team.
    """
    # bin
    teamsheets = {}

    # get jerseynumbers (jIDs)
    homejrsy, awayjrsy = _read_dat_jersey_numbers(filepath_dat)

    # loop through teams
    for team, jIDs in zip(("Home", "Away"), (homejrsy, awayjrsy)):
        jIDs = list(jIDs)
        jIDs.sort()
        player = [f"Player {i+1}" for i in range(len(jIDs))]
        teamsheet = pd.DataFrame(
            data={
                "player": player,
                "jID": jIDs,
            }
        )
        teamsheet = Teamsheet(teamsheet)
        teamsheets[team] = teamsheet

    return teamsheets
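
# Illustrative sketch (not part of the library): since the .dat file only carries
# jersey numbers, the resulting teamsheets contain generic player names. With made-up
# home-team jIDs {1, 7, 10}, the underlying DataFrame would look roughly like:
#
#        player  jID
#   0  Player 1    1
#   1  Player 2    7
#   2  Player 3   10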


def read_teamsheets_from_meta_json(
    filepath_metadata: Union[str, Path]
) -> Dict[str, Teamsheet]:
    """Reads TRACAB's metadata file (json format) and creates respective teamsheets
    for the home and the away team.

    Parameters
    ----------
    filepath_metadata: str or pathlib.Path
        Full path to _metadata.json file.

    Returns
    -------
    teamsheets: Dict[str, Teamsheet]
        Dictionary with teamsheets for the home team and the away team.
    """
    # load file
    with open(filepath_metadata, "r", encoding="utf8") as f:
        metafile = json.load(f)

    # param
    teams = ["Home", "Away"]

    # bin
    teamsheets = {team: {var: [] for var in ["player", "pID", "jID"]} for team in teams}

    # loop through teams
    for team in teams:
        team_name = team + "Team"
        for player in metafile[team_name]["Players"]:
            first_name = get_and_convert(player, "FirstName", str, "")
            last_name = get_and_convert(player, "LastName", str, "")
            full_name = first_name + " " + last_name
            teamsheets[team]["player"].append(full_name)
            teamsheets[team]["pID"].append(get_and_convert(player, "PlayerID", int))
            teamsheets[team]["jID"].append(get_and_convert(player, "JerseyNo", int))
        teamsheets[team] = Teamsheet(pd.DataFrame(teamsheets[team]))

    return teamsheets
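
# Illustrative sketch (not part of the library): a single player entry in the json
# metadata as assumed by `read_teamsheets_from_meta_json`, with made-up values. Entries
# are read from `metafile["HomeTeam"]["Players"]` and `metafile["AwayTeam"]["Players"]`.
#
#   {"FirstName": "Jane", "LastName": "Doe", "PlayerID": 123456, "JerseyNo": 7}
#
# would become one teamsheet row with player "Jane Doe", pID 123456 and jID 7.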


def read_position_data_dat(
    filepath_dat: Union[str, Path],
    filepath_metadata: Union[str, Path],
    teamsheet_home: Teamsheet = None,
    teamsheet_away: Teamsheet = None,
) -> Tuple[
    Dict[str, Dict[str, XY]],
    Dict[str, Code],
    Dict[str, Code],
    Dict[str, Teamsheet],
    Pitch,
]:
    """Parses TRACAB .dat files (ASCII) and metadata (xml or json) and extracts
    position data, possession and ballstatus codes, teamsheets as well as pitch
    information.

    ChyronHego's TRACAB system delivers two separate files: a .dat file containing the
    actual tracking data and a metadata file containing information about pitch size,
    framerate, and start- and endframes of match periods. This function provides
    high-level access to TRACAB data by parsing "the full match" given both files.

    Parameters
    ----------
    filepath_dat: str or pathlib.Path
        Full path to .dat file.
    filepath_metadata: str or pathlib.Path
        Full path to metadata.xml or metadata.json file.
    teamsheet_home: Teamsheet, optional
        Teamsheet object for the home team used to create link dictionaries of the
        form `links[team][jID] = xID`. The links are used to map players to a specific
        xID in the respective XY objects. Should be supplied for custom ordering. If
        given as None (default), the teamsheet is extracted from the .dat or .json
        file (see Notes) and xIDs are assigned to the players' jersey numbers in
        ascending order (dat case) or in order of appearance (json case).
    teamsheet_away: Teamsheet, optional
        Teamsheet object for the away team. If given as None (default), the teamsheet
        is extracted from the .dat or .json file. See teamsheet_home for details.

    Returns
    -------
    data_objects: Tuple[Dict[str, Dict[str, XY]], Dict[str, Code], Dict[str, Code], \
        Dict[str, Teamsheet], Pitch]
        Tuple of (nested) floodlight core objects with shape (xy_objects,
        possession_objects, ballstatus_objects, teamsheets, pitch).

        ``xy_objects`` is a nested dictionary containing ``XY`` objects for each team
        and segment of the form ``xy_objects[segment][team] = XY``. For a typical
        league match with two halves and two teams this dictionary looks like:
        ``{'HT1': {'Home': XY, 'Away': XY}, 'HT2': {'Home': XY, 'Away': XY}}``.

        ``possession_objects`` is a dictionary containing ``Code`` objects with
        possession information (home or away) for each segment of the form
        ``possession_objects[segment] = Code``.

        ``ballstatus_objects`` is a dictionary containing ``Code`` objects with
        ballstatus information (dead or alive) for each segment of the form
        ``ballstatus_objects[segment] = Code``.

        ``teamsheets`` is a dictionary containing ``Teamsheet`` objects for each team
        of the form ``teamsheets[team] = Teamsheet``.

        ``pitch`` is a ``Pitch`` object corresponding to the data.

    Notes
    -----
    TRACAB provides metadata in two file types: xml and json. The json metadata files
    typically include player information whereas the xml files do not. The .dat file
    storing the tracking data (e.g. from an ASCII stream) contains only player jersey
    numbers, but no additional player information. This function checks whether the
    provided ``filepath_metadata`` points to an xml or a json file. If it is a json
    file, teamsheets are generated from this source. If it is an xml file, teamsheets
    are generated from the .dat file and players are named 'Player i' with i starting
    at 1. To identify players in this case, use the jersey numbers or provide custom
    teamsheets generated by a different parser if additional data is available.
""" # check file type of metadata file_extension = filepath_metadata.split(".")[-1].upper() # read metadata and determine logic used for teamsheet parsing if file_extension == "XML": metadata, periods, pitch = _read_metadata_from_xml(filepath_metadata) teamsheet_parse_func = read_teamsheets_from_dat teamsheet_parse_file = filepath_dat elif file_extension == "JSON": metadata, periods, pitch = _read_metadata_from_json(filepath_metadata) teamsheet_parse_func = read_teamsheets_from_meta_json teamsheet_parse_file = filepath_metadata else: raise ValueError( f"Expected metadata file type to be from [XML, JSON], got {file_extension}." ) segments = list(periods.keys()) # create or check teamsheet objects with select teamsheet parsing functions & file if teamsheet_home is None and teamsheet_away is None: teamsheets = teamsheet_parse_func(teamsheet_parse_file) teamsheet_home = teamsheets["Home"] teamsheet_away = teamsheets["Away"] elif teamsheet_home is None: teamsheets = teamsheet_parse_func(teamsheet_parse_file) teamsheet_home = teamsheets["Home"] elif teamsheet_away is None: teamsheets = teamsheet_parse_func(teamsheet_parse_file) teamsheet_away = teamsheets["Away"] else: pass # potential check # create links if "xID" not in teamsheet_home.teamsheet.columns: teamsheet_home.add_xIDs() if "xID" not in teamsheet_away.teamsheet.columns: teamsheet_away.add_xIDs() links_jID_to_xID = { "Home": teamsheet_home.get_links("jID", "xID"), "Away": teamsheet_away.get_links("jID", "xID"), } # infer data array shapes number_of_home_players = max(links_jID_to_xID["Home"].values()) + 1 number_of_away_players = max(links_jID_to_xID["Away"].values()) + 1 number_of_frames = {} for segment in segments: start = periods[segment][0] end = periods[segment][1] number_of_frames[segment] = end - start + 1 # bins xydata = {} xydata["Home"] = { segment: np.full( [number_of_frames[segment], number_of_home_players * 2], np.nan ) for segment in segments } xydata["Away"] = { segment: np.full( [number_of_frames[segment], number_of_away_players * 2], np.nan ) for segment in segments } xydata["Ball"] = { segment: np.full([number_of_frames[segment], 2], np.nan) for segment in segments } codes = { code: {segment: [] for segment in segments} for code in ["possession", "ballstatus"] } # loop with open(filepath_dat, "r") as f: while True: package = f.readline() # terminate if at end of file if len(package) == 0: break # read line to get absolute frame (in file), player positions and ball info frame_abs, positions, ball = _read_dat_single_line(package) # check if frame is in any segment segment = _frame_in_period(frame_abs, periods) if segment is None: # skip line if not continue else: # otherwise calculate relative frame (in respective segment) frame_rel = frame_abs - periods[segment][0] # insert (x,y)-data into correct np.array, at correct place (t, xID) for team in ["Home", "Away"]: for jID in positions[team].keys(): # map jersey number to array index and infer respective columns x_col = (links_jID_to_xID[team][jID]) * 2 y_col = (links_jID_to_xID[team][jID]) * 2 + 1 xydata[team][segment][frame_rel, x_col] = positions[team][jID][0] xydata[team][segment][frame_rel, y_col] = positions[team][jID][1] # get ball data xydata["Ball"][segment][ frame_rel, ] = ball["position"] codes["possession"][segment].append(ball.get("possession", np.nan)) codes["ballstatus"][segment].append(ball.get("ballstatus", np.nan)) # create objects xy_objects = {} possession_objects = {} ballstatus_objects = {} for segment in segments: xy_objects[segment] = {} 
        possession_objects[segment] = Code(
            code=np.array(codes["possession"][segment]),
            name="possession",
            definitions={"H": "Home", "A": "Away"},
            framerate=metadata["framerate"],
        )
        ballstatus_objects[segment] = Code(
            code=np.array(codes["ballstatus"][segment]),
            name="ballstatus",
            definitions={"D": "Dead", "A": "Alive"},
            framerate=metadata["framerate"],
        )
        for team in ["Home", "Away", "Ball"]:
            xy_objects[segment][team] = XY(
                xy=xydata[team][segment], framerate=metadata["framerate"]
            )
    teamsheets = {
        "Home": teamsheet_home,
        "Away": teamsheet_away,
    }

    # pack objects
    data_objects = (
        xy_objects,
        possession_objects,
        ballstatus_objects,
        teamsheets,
        pitch,
    )

    return data_objects
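
# Illustrative usage sketch (not part of the library); the file paths below are
# hypothetical and the segment keys ('HT1', 'HT2') depend on the metadata file.
#
#   xy, possession, ballstatus, teamsheets, pitch = read_position_data_dat(
#       "match.dat", "match_metadata.json"
#   )
#   home_ht1 = xy["HT1"]["Home"]               # XY object, one (x, y) column pair per xID
#   possession_ht1 = possession["HT1"]         # Code object with 'H'/'A' entries
#   home_sheet = teamsheets["Home"].teamsheet  # pandas DataFrame incl. jID and xID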