Source code for floodlight.io.statsbomb

import warnings
from pathlib import Path
from typing import Tuple, Union, Dict

import os
import json
import pandas as pd

from floodlight.core.events import Events
from floodlight.core.teamsheet import Teamsheet


def read_teamsheets_from_open_event_data_json(
    filepath_events: Union[str, Path],
    filepath_match: Union[str, Path],
) -> Dict[str, Teamsheet]:
    """Reads open events and match files and returns Teamsheet objects for the home
    and the away team.

    The match ID is taken from the name of the events file (``<match_id>.json``) and
    used to look up the match in the match information file. Players are collected
    from the "Starting XI" events and from the replacements listed in
    "Substitution" events.

    Parameters
    ----------
    filepath_events: str or pathlib.Path
        Full path to json file where the Event data is saved.
    filepath_match: str or pathlib.Path
        Full path to json file where information about all matches of a season are
        stored.

    Returns
    -------
    teamsheets: Dict[str, Teamsheet]
        Dictionary with teamsheets for the home team and the away team.

    Raises
    ------
    ValueError
        If the name of the events file (without suffix) is not an integer.
    KeyError
        If the match ID is not contained in the match information file, or if a
        "Starting XI" or "Substitution" event belongs to a team that is neither the
        home nor the away team of the match.
    """
    # load json file into memory
    with open(str(filepath_match), "r", encoding="utf8") as f:
        matchinfo_list = json.load(f)
    with open(str(filepath_events), "r", encoding="utf8") as f:
        file_event_list = json.load(f)

    # retrieve match info from file; the match ID is the file name without suffix
    mID = int(Path(filepath_events).stem)
    matchinfo = next(
        (info for info in matchinfo_list if info["match_id"] == mID), None
    )

    # raise error if match is not contained in matchinfo
    if matchinfo is None:
        raise KeyError(
            f"The match with mID {mID} was not found in the specified "
            f"File of match information ({filepath_match})."
        )

    # initialize teamsheets
    columns = ["player", "position", "team_name", "jID", "pID", "tID"]
    teamsheets = {
        "Home": pd.DataFrame(columns=columns),
        "Away": pd.DataFrame(columns=columns),
    }

    # find team data in match info
    tIDs = {
        "Home": matchinfo["home_team"]["home_team_id"],
        "Away": matchinfo["away_team"]["away_team_id"],
    }
    team_names = {
        "Home": matchinfo["home_team"]["home_team_name"],
        "Away": matchinfo["away_team"]["away_team_name"],
    }

    def _team_of(event):
        """Return "Home" or "Away" for the team that the event belongs to."""
        team_id = event["team"]["id"]
        if team_id == tIDs["Home"]:
            return "Home"
        if team_id == tIDs["Away"]:
            return "Away"
        # fail with an explicit message instead of an opaque ``KeyError: None``
        raise KeyError(
            f"Event team ID {team_id} matches neither the home team "
            f"({tIDs['Home']}) nor the away team ({tIDs['Away']}) of match {mID}."
        )

    # parse starting eleven
    for event in file_event_list:
        if event["type"]["name"] != "Starting XI":
            continue
        team = _team_of(event)

        # find list of players
        players = event["tactics"]["lineup"]

        # add player data to teamsheets
        teamsheets[team]["player"] = [player["player"]["name"] for player in players]
        teamsheets[team]["pID"] = [player["player"]["id"] for player in players]
        teamsheets[team]["jID"] = [player["jersey_number"] for player in players]
        teamsheets[team]["position"] = [
            player["position"]["name"] for player in players
        ]
        teamsheets[team]["tID"] = tIDs[team]
        teamsheets[team]["team_name"] = team_names[team]

    # parse players coming in from substitutions
    for event in file_event_list:
        if event["type"]["name"] != "Substitution":
            continue
        team = _team_of(event)

        # append player data to teamsheet
        replacement = event["substitution"]["replacement"]
        player_data = pd.DataFrame(
            {
                "player": [replacement["name"]],
                "pID": [replacement["id"]],
                "jID": [pd.NA],  # unfortunately not included in substitution event
                "position": [event["position"]["name"]],
                "tID": [tIDs[team]],
                "team_name": [team_names[team]],
            }
        )
        teamsheets[team] = pd.concat((teamsheets[team], player_data), ignore_index=True)

    # create teamsheet objects
    return {team: Teamsheet(frame) for team, frame in teamsheets.items()}
def read_open_event_data_json(
    filepath_events: Union[str, Path],
    filepath_match: Union[str, Path],
    filepath_threesixty: Union[str, Path] = None,
    teamsheet_home: Teamsheet = None,
    teamsheet_away: Teamsheet = None,
) -> Tuple[Dict[str, Dict[str, Events]], Dict[str, Teamsheet]]:
    """Parses files for a single match from the StatsBomb open dataset and extracts
    the event data and teamsheets.

    This function provides high-level access to an events json file from the openly
    published StatsBomb open data and returns Event- and Teamsheet-objects for both
    teams for the full match. A StatsBomb360 json file can be passed to the function
    to include `StatsBomb360 data <https://statsbomb.com/articles/soccer/
    statsbomb-360-freeze-frame-viewer-a-new-release-in-statsbomb-iq/>`_ to the
    ``qualifier`` column. Requires the parsed files from the dataset to maintain
    their original names from the `official data repository
    <https://github.com/statsbomb/open-data>`_

    Parameters
    ----------
    filepath_events: str or pathlib.Path
        Full path to json file where the Event data is saved.
    filepath_match: str or pathlib.Path
        Full path to json file where information about all matches of a season are
        stored.
    filepath_threesixty: str or pathlib.Path, optional
        Full path to json file where the StatsBomb360 data is saved if available.
        The information about the area of the field where player positions are
        tracked (``visible_area``) and player positions at single events
        (``freeze frame``) are stored as a string in the ``qualifier`` column.
    teamsheet_home: Teamsheet, optional
        Teamsheet-object for the home team used to create link dictionaries of the
        form `links[pID] = team`. If given as None (default), teamsheet is extracted
        from the events and match json files.
    teamsheet_away: Teamsheet, optional
        Teamsheet-object for the away team. If given as None (default), teamsheet is
        extracted from the events and match json files. See teamsheet_home for
        details.

    Returns
    -------
    data_objects: Tuple[Dict[str, Dict[str, Events]], Dict[str, Teamsheet]]
        Tuple of (nested) floodlight core objects with shape (events_objects,
        teamsheets).

        ``events_objects`` is a nested dictionary containing ``Events`` objects for
        each team and segment of the form ``events_objects[segment][team] = Events``.
        For a typical league match with two halves and teams this dictionary looks
        like:
        ``{'HT1': {'Home': Events, 'Away': Events},
        'HT2': {'Home': Events, 'Away': Events}}``.

        ``teamsheets`` is a dictionary containing ``Teamsheet`` objects for each team
        of the form ``teamsheets[team] = Teamsheet``.

    Raises
    ------
    ValueError
        If the name of the events file (without suffix) is not an integer.
    KeyError
        If the possession team of an event is not one of the two teams given by the
        teamsheets.

    Notes
    -----
    StatsBomb's open format of handling provides certain additional event
    attributes, which attach additional information to certain events. As of now,
    this information is parsed as a string in the ``qualifier`` column of the
    returned DataFrame and can be transformed to a dict of form
    ``{attribute: value}``. This includes the information about the tracked position
    of (some) players and the visible area that is included in the StatsBomb360 data.
    """
    # load json files into memory
    with open(str(filepath_events), "r", encoding="utf8") as f:
        file_event_list = json.load(f)
    file_threesixty_list = None
    if filepath_threesixty is not None:
        with open(str(filepath_threesixty), "r", encoding="utf8") as f:
            file_threesixty_list = json.load(f)

    # create missing teamsheet objects; the files are parsed at most once
    if teamsheet_home is None or teamsheet_away is None:
        teamsheets = read_teamsheets_from_open_event_data_json(
            filepath_events, filepath_match
        )
        if teamsheet_home is None:
            teamsheet_home = teamsheets["Home"]
        if teamsheet_away is None:
            teamsheet_away = teamsheets["Away"]

    # create links
    links_tID_to_team = {
        teamsheet_home.teamsheet.at[0, "tID"]: "Home",
        teamsheet_away.teamsheet.at[0, "tID"]: "Away",
    }

    # sort the periods so that the segment order does not depend on set iteration
    periods = {event["period"] for event in file_event_list}
    segments = [f"HT{period}" for period in sorted(periods)]

    # match ID is the file name without suffix; Path() accepts str and Path alike
    mID = int(Path(filepath_events).stem)

    # initialize event bins
    columns = [
        "eID",
        "gameclock",
        "pID",
        "tID",
        "mID",
        "outcome",
        "timestamp",
        "minute",
        "second",
        "at_x",
        "at_y",
        "to_x",
        "to_y",
        "event_name",
        "player_name",
        "team_name",
        "qualifier",
    ]
    team_event_lists = {
        team: {segment: {col: [] for col in columns} for segment in segments}
        for team in links_tID_to_team.values()
    }

    # index the StatsBomb360 entries by event ID once instead of scanning the full
    # list for every single event
    threesixty_by_uuid = {}
    if file_threesixty_list is not None:
        for frame in file_threesixty_list:
            threesixty_by_uuid.setdefault(frame["event_uuid"], []).append(frame)

    successful_outcomes = ("Goal", "Won", "Complete", "Success In Play")
    unsuccessful_outcomes = (
        "Incomplete",
        "Lost In Play",
        "Saved Off Target",
        "Off T",
        "Blocked",
    )
    # event keys that have their own column and are left out of the qualifier
    non_qualifier_keys = (
        "team",
        "player",
        "period",
        "timestamp",
        "minute",
        "second",
        "location",
        "id",
        "type",
    )

    # parse events loop
    for event in file_event_list:
        # get team and segment information
        segment = "HT" + str(event["period"])
        team = links_tID_to_team[event["possession_team"]["id"]]

        # identifier
        event_name = event["type"]["name"]
        has_player = "player" in event

        # type-specific attributes are stored under the lower-cased type name
        # (only present for some event types)
        type_key = event_name.lower()
        has_details = type_key in event
        details = event[type_key] if has_details else None

        # outcome
        outcome = None
        if has_details:
            outcome_name = (
                details["outcome"]["name"] if "outcome" in details else "None"
            )
            if outcome_name in successful_outcomes:
                outcome = 1
            elif outcome_name in unsuccessful_outcomes:
                outcome = 0

        # relative time
        timestamp = event["timestamp"]
        minute = event["minute"]
        second = event["second"]
        millisecond = int(timestamp.split(".")[1])
        gameclock = 60 * minute + second + millisecond * 0.001

        # location
        has_location = "location" in event
        has_end_location = has_details and "end_location" in details

        # qualifier
        qual_dict = {"unique_identifier": event["id"]}
        for qualifier in event:
            if qualifier in non_qualifier_keys:
                continue
            qual_dict[qualifier] = event[qualifier]
        if file_threesixty_list is not None:
            threesixty_event = threesixty_by_uuid.get(
                qual_dict["unique_identifier"], []
            )
            if len(threesixty_event) == 1:
                qual_dict["360_freeze_frame"] = threesixty_event[0]["freeze_frame"]
                qual_dict["360_visible_area"] = threesixty_event[0]["visible_area"]
            elif len(threesixty_event) > 1:
                warnings.warn(
                    f"Found ambiguous StatsBomb event ID "
                    f"{qual_dict['unique_identifier']} matching to more than one "
                    f"StatsBomb360 event."
                )

        # collect all column values and append them to the bins in one go
        row = {
            "eID": event["type"]["id"],
            "gameclock": gameclock,
            "pID": event["player"]["id"] if has_player else None,
            "tID": event["team"]["id"],
            "mID": mID,
            "outcome": outcome,
            "timestamp": timestamp,
            "minute": minute,
            "second": second,
            "at_x": event["location"][0] if has_location else None,
            "at_y": event["location"][1] if has_location else None,
            "to_x": details["end_location"][0] if has_end_location else None,
            "to_y": details["end_location"][1] if has_end_location else None,
            "event_name": event_name,
            "player_name": event["player"]["name"] if has_player else None,
            "team_name": event["team"]["name"],
            "qualifier": str(qual_dict),
        }
        event_bins = team_event_lists[team][segment]
        for col in columns:
            event_bins[col].append(row[col])

    # create objects
    events_objects = {}
    for segment in segments:
        events_objects[segment] = {}
        for team in ["Home", "Away"]:
            events_objects[segment][team] = Events(
                events=pd.DataFrame(data=team_event_lists[team][segment]),
            )
    teamsheets = {
        "Home": teamsheet_home,
        "Away": teamsheet_away,
    }

    # pack objects
    data_objects = (events_objects, teamsheets)

    return data_objects