# Source code for floodlight.io.sportradar

from pathlib import Path
from typing import Dict, Union

import datetime
import json
import pandas as pd

from floodlight import Events
from floodlight.io.utils import get_and_convert



# [docs]
def read_event_data_json(
    filepath_events: Union[str, Path]
) -> Dict[str, Dict[str, "Events"]]:
    """Parses the Sportradar timeline files in json format and extracts the event data.

    This function provides access to `Sport Event Timeline
    <https://developer.sportradar.com/docs/read/handball/
    Handball_v2#sport-event-timeline>`_ files from the data provider `Sportradar
    <https://sportradar.com/>`_ exported in json format and returns Event objects for
    all teams and segments of the game.

    Parameters
    ----------
    filepath_events: str or pathlib.Path
        Full path to json file where the Sport Event Timeline is saved. The file is
        read as UTF-8.

    Returns
    -------
    data_objects: Dict[str: Dict[str: Events]]
        Nested dictionary with ``Events`` objects for all teams and segments. The
        returned dictionary contains one dictionary per segment, which in turn contains
        one ``Events`` object per team. For a usual league match with two halves and
        two teams this dictionary looks like:
        ``{"HT1": {"Home": Events, "Away": Events}, "HT2": {"Home": Events, "Away":
        Events}}``

    Raises
    ------
    ValueError
        If the json file does not contain a ``"timeline"`` entry.

    Notes
    -----
    Sportradar provides different information depending on the respective Event type.
    This parser itemizes top-level information for each possible event type listed in
    the `Handball v2 documentation FAQ
    <https://developer.sportradar.com/docs/read/handball/
    Handball_v2#frequently-asked-questions>`_ (most recent check: 11.01.2023).

    For example, this involves individual columns for the home and away score parsed
    from event type *score_change*. However, individual columns for players involved
    in Events, like *seven_m_awarded* or *shots* are not fully itemized, as they can
    contain different information depending on the situation. More complex information
    that changes per event type is instead included as dict or list of dicts in the
    according column, so they can be accessed if necessary.

    In the return, the following columns contain temporal information about the event:
    ``("gameclock", "time_stamp", "minute_gross", "second_gross", "minute", "second"
    )``. In handball, the match-clock determines the net playing time (60 minutes) and
    diverges from the gross "real world" time passed. The "gameclock" column contains
    the gross time passed in whole seconds in relation to the start of the respective
    segment. The "minute_gross" and "second_gross" columns contain the "gameclock"
    split into full minutes and remaining seconds, respectively. The columns "minute"
    and "second" contain the information about the net match-clock. The column
    "time_stamp" contains the global time-stamp of the respective event, parsed from
    its ISO 8601 representation.

    Events that carry no "competitor" entry are added to the ``Events`` objects of
    both teams (with "tID" and "team_name" set to None). Events that occur before the
    first *period_start* event are skipped.

    The column "outcome" in the return contains the "outcome" information in the raw
    event data and not information about the success {0, 1} of an event. The outcome in
    terms of success can be inferred by the ``eID``. E.g. "score_change" implies that a
    shot led to a goal, "shot_saved" implies that a goal was not scored.
    """

    # load full json into memory; JSON is UTF-8 encoded, so do not rely on the
    # platform default encoding (player and team names may contain non-ASCII chars)
    with open(filepath_events, encoding="utf-8") as f:
        events = json.load(f)

    # check if timeline data exists
    if "timeline" not in events:
        raise ValueError("There appears to be no timeline data in this file.")
    timeline = events["timeline"]

    # extract match id
    mID = events["sport_event"]["id"]

    # create links from home/away to team id and team name
    teams = ["Home", "Away"]
    home_away_link = {
        competitor["qualifier"].capitalize(): (competitor["id"], competitor["name"])
        for competitor in events["statistics"]["totals"]["competitors"]
    }

    # extract periods
    periods = sorted(
        {
            event["period_name"]
            for event in timeline
            if event["type"] == "period_start"
        }
    )

    # create team event dict
    columns = [
        "eID",
        "gameclock",
        "time_stamp",
        "minute_gross",
        "second_gross",
        "minute",
        "second",
        "pID",
        "player_name",
        "tID",
        "team_name",
        "mID",
        "home_score",
        "away_score",
        "scorer",
        "assists",
        "zone",
        "shot_type",
        "outcome",
        "players",
    ]

    # segment names are built from the first character of the period name
    segments = [f"HT{period[0]}" for period in periods]

    team_event_lists = {
        team: {segment: {col: [] for col in columns} for segment in segments}
        for team in teams
    }

    segment = None
    segment_start = None
    # loop through event timeline
    for event in timeline:
        if event["type"] == "period_start":
            # every period start opens a new segment and resets the gross clock
            segment = f"HT{event['period_name'][0]}"
            segment_start = datetime.datetime.fromisoformat(event["time"])
        elif segment is None:
            # skip events before first period starts
            continue

        # add all teams as competitors if no competitor is specified in event
        if "competitor" in event:
            competitor = [event["competitor"].capitalize()]
            tID, team_name = home_away_link[competitor[0]]
        else:
            competitor = teams
            tID, team_name = None, None

        if "player" in event:
            pID = event["player"]["id"]
            player_name = event["player"]["name"]
        else:
            pID, player_name = None, None

        # extract time codes; total_seconds() is used instead of the ``seconds``
        # attribute, which silently drops the ``days`` part of the difference
        time_stamp = datetime.datetime.fromisoformat(event["time"])
        gameclock = int((time_stamp - segment_start).total_seconds())
        minute_gross, second_gross = divmod(gameclock, 60)

        # extract net match-clock ("MM:SS") if available
        if "match_clock" in event:
            minute, second = [int(x) for x in event["match_clock"].split(":")]
        else:
            minute, second = None, None

        # collect all column values of this event, including optional information
        row = {
            "eID": event["type"],
            "gameclock": gameclock,
            "time_stamp": time_stamp,
            "minute_gross": minute_gross,
            "second_gross": second_gross,
            "minute": minute,
            "second": second,
            "pID": pID,
            "player_name": player_name,
            "tID": tID,
            "team_name": team_name,
            "mID": mID,
            "home_score": get_and_convert(event, "home_score", int),
            "away_score": get_and_convert(event, "away_score", int),
            # NOTE(review): typing.Dict cannot be instantiated; presumably
            # get_and_convert falls back to the raw value here - confirm before
            # replacing it with the builtin ``dict``
            "scorer": get_and_convert(event, "scorer", Dict),
            "assists": get_and_convert(event, "assists", list),
            "zone": get_and_convert(event, "zone", str),
            "shot_type": get_and_convert(event, "shot_type", str),
            "outcome": get_and_convert(event, "outcome", str),
            "players": get_and_convert(event, "players", list),
        }

        # add event to team event list
        for team in competitor:
            team_columns = team_event_lists[team][segment]
            for col in columns:
                team_columns[col].append(row[col])

    # flexible parser return for all segments and teams
    data_objects = {
        segment: {
            team: Events(events=pd.DataFrame(data=team_event_lists[team][segment]))
            for team in teams
        }
        for segment in segments
    }

    return data_objects