Source code for

from pathlib import Path
from typing import Dict, Union

import datetime
import json
import pandas as pd

from floodlight import Events
from import get_and_convert

def read_event_data_json(
    filepath_events: Union[str, Path]
) -> Dict[str, Dict[str, Dict]]:
    """Parses the Sportradar timeline files in json format and extracts the event
    data.

    This function provides access to Sport Event Timeline files (Handball v2 API)
    from the data provider Sportradar exported in json format and returns Events
    objects for all teams and segments of the game.

    Parameters
    ----------
    filepath_events: str or pathlib.Path
        Full path to json file where the Sport Event Timeline is saved.

    Returns
    -------
    data_objects: Dict[str: Dict[str: Events]]
        Nested dictionary with ``Events`` objects for all teams and segments. The
        returned dictionary contains one dictionary per segment, which in return
        contain one ``Events`` object per team. For a usual league match with two
        halves and two teams this dictionary looks like:
        ``{"HT1": {"Home": Events, "Away": Events},
        "HT2": {"Home": Events, "Away": Events}}``.
        If the timeline contains no ``period_start`` event, the dictionary is
        empty.

    Raises
    ------
    ValueError
        If the file does not contain a ``"timeline"`` entry.

    Notes
    -----
    Sportradar provides different information depending on the respective Event
    type. This parser itemizes top-level information for each possible event type
    listed in the Handball v2 documentation FAQ (most recent check: 11.01.2023).
    For example, this involves individual columns for the home and away score
    parsed from event type *score_change*. However, individual columns for players
    involved in Events, like *seven_m_awarded* or *shots* are not fully itemized,
    as they can contain different information depending on the situation. More
    complex information that changes per event type is instead included as dict or
    list of dicts in the according column, so they can be accessed if necessary.

    In the return, the following columns contain temporal information about the
    event: ``("gameclock", "time_stamp", "minute_gross", "second_gross", "minute",
    "second")``. In handball, the match-clock determines the net playing time
    (60 minutes) and diverges from the gross "real world" time passed.
    The "gameclock" column contains the gross time passed in whole seconds in
    relation to the start of the respective segment. The "minute_gross" and
    "second_gross" columns contain the "gameclock" split into minutes and
    remaining seconds, respectively. The columns "minute" and "second" contain the
    information about the net match-clock (``None`` if the event carries no
    ``match_clock``). The column "time_stamp" contains the global time-stamp of
    the respective event parsed from its ISO 8601 representation.

    Events that name no competitor are added to both teams with ``tID`` and
    ``team_name`` set to ``None``. Events that occur before the first
    ``period_start`` event are skipped.

    The column "outcome" in the return contains the "outcome" information in the
    raw event data and not information about the success {0, 1} of an event. The
    outcome in terms of success can be inferred by the ``eID``. E.g.
    "score_change" implies that a shot led to a goal, "shot_saved" implies that a
    goal was not scored.
    """
    # load full json into memory; the encoding is pinned so that non-ASCII names
    # do not depend on the platform's locale default
    with open(str(filepath_events), encoding="utf-8") as f:
        events = json.load(f)

    # check if timeline data exists
    if "timeline" not in events:
        raise ValueError("There appears to be no timeline data in this file.")
    timeline = events["timeline"]

    # extract match id
    mID = events["sport_event"]["id"]

    # create links from home/away to team id and team name
    teams = ["Home", "Away"]
    home_away_link = {
        competitor["qualifier"].capitalize(): (competitor["id"], competitor["name"])
        for competitor in events["statistics"]["totals"]["competitors"]
    }

    # extract periods
    periods = sorted(
        {event["period_name"] for event in timeline if event["type"] == "period_start"}
    )

    # create team event dict
    columns = [
        "eID",
        "gameclock",
        "time_stamp",
        "minute_gross",
        "second_gross",
        "minute",
        "second",
        "pID",
        "player_name",
        "tID",
        "team_name",
        "mID",
        "home_score",
        "away_score",
        "scorer",
        "assists",
        "zone",
        "shot_type",
        "outcome",
        "players",
    ]
    # segment names are built from the first character of the period name
    segments = [f"HT{period[0]}" for period in periods]
    team_event_lists = {
        team: {segment: {col: [] for col in columns} for segment in segments}
        for team in teams
    }

    segment = None
    segment_start = None

    # loop through event timeline
    for event in timeline:
        if event["type"] == "period_start":
            # a period start opens a new segment and is itself recorded below
            segment = f"HT{event['period_name'][0]}"
            segment_start = datetime.datetime.fromisoformat(event["time"])
        elif segment is None:
            # skip events before first period starts
            continue

        # add all teams as competitors if no competitor is specified in event
        if "competitor" in event:
            involved_teams = [event["competitor"].capitalize()]
            tID, team_name = home_away_link[involved_teams[0]]
        else:
            involved_teams = teams
            tID, team_name = None, None

        # extract player id and name
        pID = event["player"]["id"] if "player" in event else None
        player_name = event["player"]["name"] if "player" in event else None

        # extract time codes and match-clock; total_seconds() is used instead of
        # the ``seconds`` attribute, which ignores the days component and wraps
        # around for negative offsets
        time_stamp = datetime.datetime.fromisoformat(event["time"])
        gameclock = int((time_stamp - segment_start).total_seconds())
        minute_gross, second_gross = divmod(gameclock, 60)
        if "match_clock" in event:
            minute, second = [int(x) for x in event["match_clock"].split(":")]
        else:
            minute, second = (None, None)

        # collect one value per column, including optional event information
        row = {
            "eID": event["type"],
            "gameclock": gameclock,
            "time_stamp": time_stamp,
            "minute_gross": minute_gross,
            "second_gross": second_gross,
            "minute": minute,
            "second": second,
            "pID": pID,
            "player_name": player_name,
            "tID": tID,
            "team_name": team_name,
            "mID": mID,
            "home_score": get_and_convert(event, "home_score", int),
            "away_score": get_and_convert(event, "away_score", int),
            "scorer": get_and_convert(event, "scorer", Dict),
            "assists": get_and_convert(event, "assists", list),
            "zone": get_and_convert(event, "zone", str),
            "shot_type": get_and_convert(event, "shot_type", str),
            "outcome": get_and_convert(event, "outcome", str),
            "players": get_and_convert(event, "players", list),
        }

        # add event to team event list
        for team in involved_teams:
            team_columns = team_event_lists[team][segment]
            for col in columns:
                team_columns[col].append(row[col])

    # flexible parser return for all segments and teams
    data_objects = {
        segment: {
            team: Events(events=pd.DataFrame(data=team_event_lists[team][segment]))
            for team in teams
        }
        for segment in segments
    }

    return data_objects