Source code for floodlight.io.secondspectrum

import json
from pathlib import Path
from typing import Dict, Tuple, Union

import pytz
import iso8601
import numpy as np
import pandas as pd

from floodlight.core.events import Events
from floodlight.core.code import Code
from floodlight.core.pitch import Pitch
from floodlight.core.xy import XY
from floodlight.core.teamsheet import Teamsheet
from floodlight.io.utils import get_and_convert


def _get_position_precedence(position: str) -> int:
    """Get the precedence of a position abbreviation (e.g. 'GK') useful for ordering."""
    PLAYER_POSITION_PRECEDENCE = {
        "GK": 1,
        "LB": 2,
        "LCB": 3,
        "CB": 4,
        "RCB": 5,
        "RB": 6,
        "LWB": 7,
        "LDM": 8,
        "CDM": 9,
        "RDM": 10,
        "RWB": 11,
        "LM": 12,
        "LCM": 13,
        "CM": 14,
        "RCM": 15,
        "RM": 16,
        "LW": 17,
        "CAM": 18,
        "RW": 19,
        "LF": 20,
        "LCF": 21,
        "CF": 22,
        "RCF": 23,
        "RF": 24,
        "SUB": 99,
    }
    # unknown positions are inserted between known positions and substitutes
    precedence = PLAYER_POSITION_PRECEDENCE.get(position, 25)

    return precedence

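
# Illustrative sketch (not part of the library): the precedence values above are used
# below to sort teamsheet rows, e.g. goalkeeper first, substitutes last. The position
# labels are assumed example values; 'XYZ' stands for an abbreviation not in the table.
if __name__ == "__main__":
    _example_positions = ["SUB", "CF", "XYZ", "GK", "RB"]
    print(sorted(_example_positions, key=_get_position_precedence))
    # -> ['GK', 'RB', 'CF', 'XYZ', 'SUB']
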

def _read_metajson(
    filepath_metadata: Union[str, Path]
) -> Tuple[Dict, Dict, Dict, Pitch]:
    """Reads Second Spectrums's metadata file and extracts information about match
    metainfo, periods, playing directions, and the pitch.

    Parameters
    ----------
    filepath_metadata: str or pathlib.Path
        Full path to _meta.json file.

    Returns
    -------
    metadata: dict
        Dictionary with meta information such as framerate.
    periods: Dict[str, Tuple[int, int]]
        Dictionary with start and endframes of all segments, e.g.,
        ``periods[segment] = (startframe, endframe)``.
    directions: Dict[str, Dict[str, str]]
        Dictionary with playing direction information of all segments and teams,
        e.g., ``directions[segment][team] = 'lr'``
    pitch: Pitch
        Pitch object with actual pitch length and width.
    """
    # read file
    with open(str(filepath_metadata), "r") as f:
        metajson = json.load(f)

    # bin
    metadata = {}
    periods = {}
    directions = {}

    # assemble metadata
    metadata["framerate"] = metajson.get("fps", None)
    metadata["length"] = metajson.get("pitchLength", None)
    metadata["width"] = metajson.get("pitchWidth", None)
    metadata["home_tID"] = get_and_convert(metajson, "homeOptaId", int)
    metadata["away_tID"] = get_and_convert(metajson, "awayOptaId", int)

    # get period information
    for period in metajson["periods"]:
        segment = f"HT{period['number']}"
        periods[segment] = (period["startFrameIdx"], period["endFrameIdx"] + 1)

    # get playing direction
    for period in metajson["periods"]:
        segment = f"HT{period['number']}"
        directions[segment] = {}
        if period["homeAttPositive"]:
            directions[segment]["Home"] = "lr"
            directions[segment]["Away"] = "rl"
        else:
            directions[segment]["Home"] = "rl"
            directions[segment]["Away"] = "lr"

    # generate pitch object
    pitch = Pitch.from_template(
        template_name="secondspectrum",
        length=metadata["length"],
        width=metadata["width"],
        sport="football",
    )

    return metadata, periods, directions, pitch


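# Illustrative sketch (not part of the library): a minimal dictionary containing only
# the keys this parser reads from a _meta.json file (framerate, pitch dimensions, Opta
# team IDs, and per-period frame ranges). All values are assumed example values; real
# Second Spectrum meta files contain additional fields.
if __name__ == "__main__":
    import tempfile

    _example_meta = {
        "fps": 25,
        "pitchLength": 105.0,
        "pitchWidth": 68.0,
        "homeOptaId": 1,
        "awayOptaId": 2,
        "periods": [
            {
                "number": 1,
                "startFrameIdx": 0,
                "endFrameIdx": 67499,
                "homeAttPositive": True,
            },
            {
                "number": 2,
                "startFrameIdx": 70000,
                "endFrameIdx": 137499,
                "homeAttPositive": False,
            },
        ],
    }
    with tempfile.NamedTemporaryFile("w", suffix="_meta.json", delete=False) as _tmp:
        json.dump(_example_meta, _tmp)
    _metadata, _periods, _directions, _pitch = _read_metajson(_tmp.name)
    print(_periods)     # {'HT1': (0, 67500), 'HT2': (70000, 137500)}
    print(_directions)  # {'HT1': {'Home': 'lr', 'Away': 'rl'}, 'HT2': {'Home': 'rl', 'Away': 'lr'}}
    Path(_tmp.name).unlink()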


def read_teamsheets_from_meta_json(
    filepath_metadata: Union[str, Path]
) -> Dict[str, Teamsheet]:
    """Parses the Second Spectrum meta.json file and creates respective teamsheets for
    the home and the away team.

    Parameters
    ----------
    filepath_metadata: str or pathlib.Path
        Full path to _meta.json file.

    Returns
    -------
    teamsheets: Dict[str, Teamsheet]
        Dictionary with teamsheets for the home team and the away team.

    Notes
    -----
    The ordering of players is determined by position precedence, with the following
    precedence values assigned to Second Spectrum's player position information::

        {
            'GK': 1,
            'LB': 2,
            'LCB': 3,
            'CB': 4,
            'RCB': 5,
            'RB': 6,
            'LWB': 7,
            'LDM': 8,
            'CDM': 9,
            'RDM': 10,
            'RWB': 11,
            'LM': 12,
            'LCM': 13,
            'CM': 14,
            'RCM': 15,
            'RM': 16,
            'LW': 17,
            'CAM': 18,
            'RW': 19,
            'LF': 20,
            'LCF': 21,
            'CF': 22,
            'RCF': 23,
            'RF': 24,
            'SUB': 99
        }
    """
    # read file
    with open(str(filepath_metadata), "r") as f:
        metajson = json.load(f)

    # param
    key_map = {"Home": "homePlayers", "Away": "awayPlayers"}

    # bin
    teamsheets = {team: None for team in ["Home", "Away"]}

    # loop through teams
    for team in ["Home", "Away"]:
        # bin
        teamsheet = {
            column: [] for column in ["precedence", "player", "jID", "pID", "position"]
        }

        # query team player list
        player_list = metajson[key_map[team]]

        # add players to list
        for player in player_list:
            # query
            name = get_and_convert(player, "name", str)
            position = get_and_convert(player, "position", str)
            jID = get_and_convert(player, "number", int)
            pID = get_and_convert(player, "optaId", int)
            precedence = _get_position_precedence(position)

            # assign
            teamsheet["player"].append(name)
            teamsheet["position"].append(position)
            teamsheet["jID"].append(jID)
            teamsheet["pID"].append(pID)
            teamsheet["precedence"].append(precedence)

        # curate
        teamsheet = pd.DataFrame(teamsheet)
        teamsheet.sort_values("precedence", inplace=True)
        teamsheet.drop(["precedence"], axis=1, inplace=True)
        teamsheet.reset_index(drop=True, inplace=True)
        teamsheet = Teamsheet(teamsheet)
        teamsheets[team] = teamsheet

    return teamsheets
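

# Illustrative usage sketch (not part of the library). The path below is a
# hypothetical placeholder for an actual Second Spectrum *_meta.json file.
if __name__ == "__main__":
    _meta_path = Path("example_match_meta.json")
    if _meta_path.exists():
        _teamsheets = read_teamsheets_from_meta_json(_meta_path)
        # each Teamsheet wraps a DataFrame with player, jID, pID, and position columns
        print(_teamsheets["Home"].teamsheet.head())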


def read_position_data_jsonl(
    filepath_position: Union[str, Path],
    filepath_metadata: Union[str, Path],
    teamsheet_home: Teamsheet = None,
    teamsheet_away: Teamsheet = None,
) -> Tuple[
    Dict[str, Dict[str, XY]],
    Dict[str, Code],
    Dict[str, Code],
    Dict[str, Teamsheet],
    Pitch,
]:
    """Parse Second Spectrum files and extract position data, possession and ballstatus
    codes, as well as pitch information.

    Second Spectrum data is typically stored in two separate files, a .jsonl file
    containing the actual data as well as a _meta.json containing information about
    pitch size, framerate, lineups, and start- and endframes of match periods. This
    function provides high-level access to Second Spectrum data by parsing "the full
    match" given both files.

    Parameters
    ----------
    filepath_position: str or pathlib.Path
        Full path to .jsonl-file.
    filepath_metadata: str or pathlib.Path
        Full path to _meta.json file.
    teamsheet_home: Teamsheet, optional
        Teamsheet object for the home team used to create link dictionaries of the
        form `links[team][jID] = xID`. The links are used to map players to a specific
        xID in the respective XY objects. Should be supplied for custom ordering. If
        given as None (default), the teamsheet is extracted from the meta.json file
        and xIDs are assigned based on the ordering determined by the
        ``read_teamsheets_from_meta_json`` function (see for details).
    teamsheet_away: Teamsheet, optional
        Teamsheet object for the away team. If given as None (default), the teamsheet
        is extracted from the meta.json file. See teamsheet_home for details.

    Returns
    -------
    data_objects: Tuple[Dict[str, Dict[str, XY]], Dict[str, Code], Dict[str, Code], \
        Dict[str, Teamsheet], Pitch]
        Tuple of (nested) floodlight core objects with shape (xy_objects,
        possession_objects, ballstatus_objects, teamsheets, pitch).

        ``xy_objects`` is a nested dictionary containing ``XY`` objects for each team
        and segment of the form ``xy_objects[segment][team] = XY``. For a typical
        league match with two halves and teams this dictionary looks like:
        ``{'HT1': {'Home': XY, 'Away': XY}, 'HT2': {'Home': XY, 'Away': XY}}``.

        ``possession_objects`` is a dictionary containing ``Code`` objects with
        possession information (home or away) for each segment of the form
        ``possession_objects[segment] = Code``.

        ``ballstatus_objects`` is a dictionary containing ``Code`` objects with
        ballstatus information (dead or alive) for each segment of the form
        ``ballstatus_objects[segment] = Code``.

        ``teamsheets`` is a dictionary containing ``Teamsheet`` objects for each team
        of the form ``teamsheets[team] = Teamsheet``.

        ``pitch`` is a ``Pitch`` object corresponding to the data.
    """
    # setup
    metadata, periods, directions, pitch = _read_metajson(str(filepath_metadata))
    segments = list(periods.keys())
    teams = ["Home", "Away"]
    fps = int(metadata["framerate"])
    status_link = {True: "A", False: "D"}
    key_map = {"Home": "homePlayers", "Away": "awayPlayers"}

    # create or check teamsheet objects
    if teamsheet_home is None and teamsheet_away is None:
        teamsheets = read_teamsheets_from_meta_json(filepath_metadata)
        teamsheet_home = teamsheets["Home"]
        teamsheet_away = teamsheets["Away"]
    elif teamsheet_home is None:
        teamsheets = read_teamsheets_from_meta_json(filepath_metadata)
        teamsheet_home = teamsheets["Home"]
    elif teamsheet_away is None:
        teamsheets = read_teamsheets_from_meta_json(filepath_metadata)
        teamsheet_away = teamsheets["Away"]
    else:
        pass
        # potential check

    # create links
    if "xID" not in teamsheet_home.teamsheet.columns:
        teamsheet_home.add_xIDs()
    if "xID" not in teamsheet_away.teamsheet.columns:
        teamsheet_away.add_xIDs()
    links_jID_to_xID = {
        "Home": teamsheet_home.get_links("jID", "xID"),
        "Away": teamsheet_away.get_links("jID", "xID"),
    }

    # bins
    xydata = {
        team: {
            segment: np.full(
                [
                    periods[segment][1] - periods[segment][0],  # T frames in segment
                    len(links_jID_to_xID[team]) * 2,  # N players in team
                ],
                np.nan,
            )
            for segment in segments
        }
        for team in teams
    }
    xydata["Ball"] = {
        segment: np.full([periods[segment][1] - periods[segment][0], 2], np.nan)
        for segment in segments
    }
    codes = {
        code: {
            segment: np.full(
                [periods[segment][1] - periods[segment][0]], np.nan, dtype=object
            )
            for segment in segments
        }
        for code in ["possession", "ballstatus"]
    }

    # loop
    with open(str(filepath_position), "r") as f:
        while True:
            # get one line of file
            dataline = f.readline()
            # terminate if at end of file
            if len(dataline) == 0:
                break
            # load json
            dataline = json.loads(dataline)

            # get dataline meta information and correct for frame offset
            segment = f"HT{dataline['period']}"
            frame_abs = dataline["frameIdx"]
            frame_rel = frame_abs - periods[segment][0]

            # insert (x,y)-data into correct np.array, at correct place (t, xID)
            for team in teams:
                player_data = dataline[key_map[team]]
                for player in player_data:
                    # map jersey number to array index and infer respective columns
                    jID = player["number"]
                    x_col = (links_jID_to_xID[team][jID]) * 2
                    y_col = (links_jID_to_xID[team][jID]) * 2 + 1
                    xydata[team][segment][frame_rel, (x_col, y_col)] = player["xyz"][:2]

            # get ball data
            if "ball" in dataline and dataline["ball"].get("xyz") is not None:
                xydata["Ball"][segment][frame_rel] = dataline["ball"]["xyz"][:2]

            # get codes
            if "lastTouch" in dataline:
                possession = dataline["lastTouch"][0].upper()
            else:
                possession = None
            codes["possession"][segment][frame_rel] = possession
            if "live" in dataline:
                ballstatus = status_link[dataline["live"]]
            else:
                ballstatus = None
            codes["ballstatus"][segment][frame_rel] = ballstatus

    # create objects
    xy_objects = {}
    possession_objects = {}
    ballstatus_objects = {}
    for segment in segments:
        xy_objects[segment] = {}
        possession_objects[segment] = Code(
            code=codes["possession"][segment],
            name="possession",
            definitions={"H": "Home", "A": "Away"},
            framerate=fps,
        )
        ballstatus_objects[segment] = Code(
            code=codes["ballstatus"][segment],
            name="ballstatus",
            definitions={"D": "Dead", "A": "Alive"},
            framerate=fps,
        )
        for team in ["Home", "Away"]:
            xy_objects[segment][team] = XY(
                xy=xydata[team][segment],
                framerate=fps,
                direction=directions[segment][team],
            )
        xy_objects[segment]["Ball"] = XY(xy=xydata["Ball"][segment], framerate=fps)
    teamsheets = {
        "Home": teamsheet_home,
        "Away": teamsheet_away,
    }

    # pack objects
    data_objects = (
        xy_objects,
        possession_objects,
        ballstatus_objects,
        teamsheets,
        pitch,
    )

    return data_objects
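

# Illustrative usage sketch (not part of the library). Both paths are hypothetical
# placeholders for a Second Spectrum tracking (.jsonl) file and its _meta.json file.
if __name__ == "__main__":
    _position_path = Path("example_match.jsonl")
    _meta_path = Path("example_match_meta.json")
    if _position_path.exists() and _meta_path.exists():
        xy_objects, possession_objects, ballstatus_objects, teamsheets, pitch = (
            read_position_data_jsonl(_position_path, _meta_path)
        )
        # home team positions of the first half as a (T x 2N) array
        print(xy_objects["HT1"]["Home"].xy.shape)
        # possession code ('H'/'A') for the first ten frames of the first half
        print(possession_objects["HT1"].code[:10])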


def read_event_data_jsonl(
    filepath_insight: Union[str, Path],
    filepath_metadata: Union[str, Path],
) -> Tuple[Dict[str, Dict[str, Events]], Pitch]:
    """Parse Second Spectrum's Insight file (containing match events) and extract
    event data and pitch information.

    This function provides high-level access to the given Second Spectrum Insight
    file and will return event objects for both teams. The number of segments is
    inferred from the data, yet data for each segment is stored in a separate object.

    Parameters
    ----------
    filepath_insight: str or pathlib.Path
        Full path to .jsonl-file.
    filepath_metadata: str or pathlib.Path
        Full path to _meta.json file.

    Returns
    -------
    data_objects: Tuple[Dict[str, Dict[str, Events]], Pitch]
        Tuple of (nested) floodlight core objects with shape (events_objects, pitch).

        ``events_objects`` is a nested dictionary containing ``Events`` objects for
        each team and segment of the form ``events_objects[segment][team] = Events``.
        For a typical league match with two halves and teams this dictionary looks
        like:
        ``{'HT1': {'Home': Events, 'Away': Events}, 'HT2': {'Home': Events, 'Away':
        Events}}``.

        ``pitch`` is a ``Pitch`` object corresponding to the data.

    Notes
    -----
    Second Spectrum's Insight files can be seen as a union of (wrapped) Opta event
    feeds and attached Second Spectrum markings. Thus, properties of the Opta F24-feed
    parser also mostly apply to this parser. This particularly includes the handling
    of qualifiers, which are included as a string in the ``qualifier`` column of the
    returned DataFrames. Second Spectrum markings are disregarded at the moment, but
    could be included in future releases.
    """
    # 1. assemble meta info
    metadata, periods, _, _ = _read_metajson(filepath_metadata)
    teams = ["Home", "Away"]
    tID_link = {
        metadata["home_tID"]: "Home",
        metadata["away_tID"]: "Away",
    }
    segments = periods.keys()

    # 2. parse events
    # bins
    columns = [
        "eID",
        "gameclock",
        "pID",
        "outcome",
        "timestamp",
        "minute",
        "second",
        "at_x",
        "at_y",
        "qualifier",
    ]
    event_lists = {
        team: {segment: {col: [] for col in columns} for segment in segments}
        for team in teams
    }
    directions = {team: {} for team in teams}
    dir_link = {"Left to Right": "lr", "Right to Left": "rl"}
    segment_offsets = {1: 0, 2: 45, 3: 90, 4: 105}
    kickoffs = {}

    # parse file for meta events (directions and kick-offs)
    with open(str(filepath_insight), "r") as f:
        while True:
            # get one line of file
            dataline = f.readline()
            # terminate if at end of file
            if len(dataline) == 0:
                break
            # load json
            dataline = json.loads(dataline)
            optaline = dataline["optaEvent"]
            if optaline is None:
                continue
            if optaline["typeId"] == 32:
                # get team and segment information
                period = get_and_convert(optaline, "periodId", int)
                segment = "HT" + str(period)
                tID = get_and_convert(optaline, "opContestantId", int)
                team = tID_link[tID]
                # read kickoff times
                kickoff_timestring = get_and_convert(optaline, "timeStamp", str)
                kickoff_datetime = iso8601.parse_date(
                    kickoff_timestring, default_timezone=pytz.utc
                )
                kickoffs[segment] = kickoff_datetime
                # read playing direction
                direction = None
                for qualifier in get_and_convert(optaline, "qualifier", list, []):
                    if get_and_convert(qualifier, "qualifierId", int) == 127:
                        value = get_and_convert(qualifier, "value", str)
                        direction = dir_link.get(value)
                directions[team][segment] = direction

    # parse file for match events
    with open(str(filepath_insight), "r") as f:
        while True:
            # get one line of file
            dataline = f.readline()
            # terminate if at end of file
            if len(dataline) == 0:
                break
            # load json
            dataline = json.loads(dataline)
            optaline = dataline["optaEvent"]
            # secspecline = dataline["2sMarking"]
            if optaline is None:
                continue

            # get team and segment information
            period = get_and_convert(optaline, "periodId", int)
            segment = "HT" + str(period)
            tID = get_and_convert(optaline, "opContestantId", int)
            team = tID_link[tID]

            # skip match-unrelated events
            if period not in range(1, 6):
                continue

            # identifier and outcome
            eID = get_and_convert(optaline, "typeId", int)
            # skip unwanted events
            if eID in [30]:
                continue
            pID = get_and_convert(optaline, "opPlayerId", int)
            outcome = get_and_convert(optaline, "outcome", int)
            event_lists[team][segment]["eID"].append(eID)
            event_lists[team][segment]["pID"].append(pID)
            event_lists[team][segment]["outcome"].append(outcome)

            # absolute and relative time
            event_timestring = get_and_convert(optaline, "timeStamp", str)
            minute = get_and_convert(optaline, "timeMin", int)
            # transform minute to be relative to current segment
            minute -= segment_offsets[period]
            second = get_and_convert(optaline, "timeSec", int)
            timestamp = iso8601.parse_date(event_timestring, default_timezone=pytz.utc)
            delta = timestamp - kickoffs[segment]
            gameclock = delta.total_seconds()
            # re-adjust pre-kick-off events (e.g. substitutions) to 00:00
            gameclock = max(gameclock, 0.0)
            event_lists[team][segment]["timestamp"].append(timestamp)
            event_lists[team][segment]["minute"].append(minute)
            event_lists[team][segment]["second"].append(second)
            event_lists[team][segment]["gameclock"].append(gameclock)

            # location
            at_x = get_and_convert(optaline, "x", float)
            at_y = get_and_convert(optaline, "y", float)
            event_lists[team][segment]["at_x"].append(at_x)
            event_lists[team][segment]["at_y"].append(at_y)

            # qualifier
            qual_dict = {}
            for qualifier in get_and_convert(optaline, "qualifier", list, []):
                qual_id = get_and_convert(qualifier, "qualifierId", int)
                qual_value = qualifier.get("value")
                qual_dict[qual_id] = qual_value
            event_lists[team][segment]["qualifier"].append(str(qual_dict))

    # create objects
    events_objects = {}
    for segment in segments:
        events_objects[segment] = {}
        for team in ["Home", "Away"]:
            events_objects[segment][team] = Events(
                events=pd.DataFrame(data=event_lists[team][segment]),
                direction=directions[team][segment],
            )
    pitch = Pitch.from_template(
        "opta", length=metadata["length"], width=metadata["width"], sport="football"
    )

    # pack objects
    data_objects = (events_objects, pitch)

    return data_objects
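

# Illustrative usage sketch (not part of the library). Both paths are hypothetical
# placeholders for a Second Spectrum Insight (.jsonl) file and its _meta.json file.
if __name__ == "__main__":
    _insight_path = Path("example_match_insight.jsonl")
    _meta_path = Path("example_match_meta.json")
    if _insight_path.exists() and _meta_path.exists():
        events_objects, pitch = read_event_data_jsonl(_insight_path, _meta_path)
        # home team events of the first half as a pandas DataFrame
        print(events_objects["HT1"]["Home"].events.head())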