import warnings
from pathlib import Path
from typing import Tuple, Union, Dict
import os
import json
import pandas as pd
from floodlight.core.events import Events
from floodlight.core.teamsheet import Teamsheet
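# NOTE: a stray "[docs]" token stood here (apparently a link label left over from a
# rendered documentation page); as code it would raise a NameError on import.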
def read_teamsheets_from_open_event_data_json(
    filepath_events: Union[str, Path],
    filepath_match: Union[str, Path],
) -> Dict[str, Teamsheet]:
    """Reads open events and match files and returns Teamsheet objects for the home
    and the away team.

    The teamsheets contain the players listed in the ``Starting XI`` events followed
    by the players coming in through ``Substitution`` events, in file order.

    Parameters
    ----------
    filepath_events: str or pathlib.Path
        Full path to json file where the Event data is saved. The file name (without
        suffix) is interpreted as the match ID.
    filepath_match: str or pathlib.Path
        Full path to json file where information about all matches of a season are
        stored.

    Returns
    -------
    teamsheets: Dict[str, Teamsheet]
        Dictionary with teamsheets for the home team and the away team.

    Raises
    ------
    ValueError
        If the name of the events file (without suffix) is not an integer.
    KeyError
        If the match ID is not contained in the match information file, or if a
        ``Starting XI`` or ``Substitution`` event belongs to a team that is neither
        the home nor the away team of the match.
    """
    # load json files into memory
    with open(str(filepath_match), "r", encoding="utf8") as f:
        matchinfo_list = json.load(f)
    with open(str(filepath_events), "r", encoding="utf8") as f:
        file_event_list = json.load(f)

    # the match ID is taken from the file name; Path.stem is independent of the
    # path separator and of the length of the suffix
    mID = int(Path(filepath_events).stem)

    # retrieve match info from file
    matchinfo = next(
        (info for info in matchinfo_list if info["match_id"] == mID), None
    )
    # raise error if match is not contained in matchinfo
    if matchinfo is None:
        raise KeyError(
            f"The match with mID {mID} was not found in the specified "
            f"File of match information ({filepath_match})."
        )

    # find team data in match info
    tIDs = {
        "Home": matchinfo["home_team"]["home_team_id"],
        "Away": matchinfo["away_team"]["away_team_id"],
    }
    team_names = {
        "Home": matchinfo["home_team"]["home_team_name"],
        "Away": matchinfo["away_team"]["away_team_name"],
    }

    # collect one row (dict) per player; starters and substitutes are kept apart so
    # that starters always precede substitutes in the final teamsheet
    columns = ["player", "position", "team_name", "jID", "pID", "tID"]
    starters = {"Home": [], "Away": []}
    substitutes = {"Home": [], "Away": []}

    for event in file_event_list:
        event_type = event["type"]["name"]
        if event_type not in ("Starting XI", "Substitution"):
            continue

        # find team
        event_tID = event["team"]["id"]
        if event_tID == tIDs["Home"]:
            team = "Home"
        elif event_tID == tIDs["Away"]:
            team = "Away"
        else:
            # events and match file do not belong together; fail with a clear message
            # instead of an opaque ``KeyError: None`` further down
            raise KeyError(
                f"The '{event_type}' event belongs to the team with tID {event_tID}, "
                f"which is neither the home team (tID {tIDs['Home']}) nor the away "
                f"team (tID {tIDs['Away']}) of the match with mID {mID}."
            )

        if event_type == "Starting XI":
            # parse starting eleven
            starters[team] = [
                {
                    "player": player["player"]["name"],
                    "pID": player["player"]["id"],
                    "jID": player["jersey_number"],
                    "position": player["position"]["name"],
                    "tID": tIDs[team],
                    "team_name": team_names[team],
                }
                for player in event["tactics"]["lineup"]
            ]
        else:
            # parse player coming in from substitution
            substitutes[team].append(
                {
                    "player": event["substitution"]["replacement"]["name"],
                    "pID": event["substitution"]["replacement"]["id"],
                    "jID": pd.NA,  # unfortunately not included in substitution event
                    "position": event["position"]["name"],
                    "tID": tIDs[team],
                    "team_name": team_names[team],
                }
            )

    # create teamsheet objects
    teamsheets = {}
    for team in team_names:
        rows = starters[team] + substitutes[team]
        teamsheets[team] = Teamsheet(pd.DataFrame(rows, columns=columns))

    return teamsheets
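# NOTE: a stray "[docs]" token stood here (apparently a link label left over from a
# rendered documentation page); as code it would raise a NameError on import.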
def read_open_event_data_json(
    filepath_events: Union[str, Path],
    filepath_match: Union[str, Path],
    filepath_threesixty: Union[str, Path] = None,
    teamsheet_home: Teamsheet = None,
    teamsheet_away: Teamsheet = None,
) -> Tuple[Dict[str, Dict[str, Events]], Dict[str, Teamsheet]]:
    """Parses files for a single match from the StatsBomb open dataset and extracts the
    event data and teamsheets.

    This function provides high-level access to an events json file from the openly
    published StatsBomb open data and returns Event- and Teamsheet-objects for both
    teams for the full match. A StatsBomb360 json file can be passed to the function to
    include `StatsBomb360 data <https://statsbomb.com/articles/soccer/
    statsbomb-360-freeze-frame-viewer-a-new-release-in-statsbomb-iq/>`_ to the
    ``qualifier`` column. Requires the parsed files from the dataset to maintain their
    original names from the `official data repository
    <https://github.com/statsbomb/open-data>`_

    Parameters
    ----------
    filepath_events: str or pathlib.Path
        Full path to json file where the Event data is saved. The file name (without
        suffix) is interpreted as the match ID.
    filepath_match: str or pathlib.Path
        Full path to json file where information about all matches of a season are
        stored. Only read if at least one teamsheet is not given.
    filepath_threesixty: str or pathlib.Path, optional
        Full path to json file where the StatsBomb360 data is saved if available. The
        information about the area of the field where player positions are tracked
        (``visible_area``) and player positions at single events (``freeze frame``) are
        stored as a string in the ``qualifier`` column.
    teamsheet_home: Teamsheet, optional
        Teamsheet-object for the home team used to create link dictionaries of the form
        `links[pID] = team`. If given as None (default), teamsheet is extracted from
        the events and match json files.
    teamsheet_away: Teamsheet, optional
        Teamsheet-object for the away team. If given as None (default), teamsheet is
        extracted from the events and match json files. See teamsheet_home for details.

    Returns
    -------
    data_objects: Tuple[Dict[str, Dict[str, Events]], Dict[str, Teamsheet]]
        Tuple of (nested) floodlight core objects with shape (events_objects,
        teamsheets).
        ``events_objects`` is a nested dictionary containing ``Events`` objects for
        each team and segment of the form ``events_objects[segment][team] = Events``.
        For a typical league match with two halves and teams this dictionary looks like:
        ``{'HT1': {'Home': Events, 'Away': Events}, 'HT2': {'Home': Events, 'Away':
        Events}}``.
        ``teamsheets`` is a dictionary containing ``Teamsheet`` objects for each team
        of the form ``teamsheets[team] = Teamsheet``.

    Raises
    ------
    ValueError
        If the name of the events file (without suffix) is not an integer.

    Notes
    -----
    StatsBomb's open format of handling provides certain additional event attributes,
    which attach additional information to certain events. As of now, this information
    is parsed as a string in the ``qualifier`` column of the returned DataFrame and can
    be transformed to a dict of form ``{attribute: value}``. This includes the
    information about the tracked position of (some) players and the visible area
    that is included in the StatsBomb360 data.
    """
    # load json files into memory
    with open(str(filepath_events), "r", encoding="utf8") as f:
        file_event_list = json.load(f)

    # index the StatsBomb360 frames by event UUID once, so that each event needs a
    # single dictionary lookup instead of a scan over the whole 360 file
    threesixty_by_uuid = None
    if filepath_threesixty is not None:
        with open(str(filepath_threesixty), "r", encoding="utf8") as f:
            file_threesixty_list = json.load(f)
        threesixty_by_uuid = {}
        for frame in file_threesixty_list:
            threesixty_by_uuid.setdefault(frame["event_uuid"], []).append(frame)

    # create missing teamsheet objects; teamsheets passed by the caller are kept
    if teamsheet_home is None or teamsheet_away is None:
        parsed_teamsheets = read_teamsheets_from_open_event_data_json(
            filepath_events, filepath_match
        )
        if teamsheet_home is None:
            teamsheet_home = parsed_teamsheets["Home"]
        if teamsheet_away is None:
            teamsheet_away = parsed_teamsheets["Away"]

    # create links
    links_tID_to_team = {
        teamsheet_home.teamsheet.at[0, "tID"]: "Home",
        teamsheet_away.teamsheet.at[0, "tID"]: "Away",
    }

    # sorted so that the segment order does not depend on set iteration order
    periods = sorted({event["period"] for event in file_event_list})
    segments = [f"HT{period}" for period in periods]

    # the match ID is taken from the file name; works for str and pathlib.Path
    mID = int(Path(filepath_events).stem)

    # initialize event bins
    columns = [
        "eID",
        "gameclock",
        "pID",
        "tID",
        "mID",
        "outcome",
        "timestamp",
        "minute",
        "second",
        "at_x",
        "at_y",
        "to_x",
        "to_y",
        "event_name",
        "player_name",
        "team_name",
        "qualifier",
    ]
    team_event_lists = {
        team: {segment: {col: [] for col in columns} for segment in segments}
        for team in links_tID_to_team.values()
    }

    # parse events loop
    for event in file_event_list:
        # get team and segment information
        segment = f"HT{event['period']}"
        # NOTE(review): events are binned by the team in possession, not by the team
        # performing the event (which is stored in the ``tID`` column) - confirm that
        # this is intended.
        team = links_tID_to_team[event["possession_team"]["id"]]

        # identifier
        event_name = event["type"]["name"]
        player = event.get("player")

        # type-specific payload, stored under the lower-cased type name (e.g. "pass")
        details = event.get(event_name.lower())
        has_end_location = details is not None and "end_location" in details

        # relative time
        timestamp = event["timestamp"]
        minute = event["minute"]
        second = event["second"]
        millisecond = int(timestamp.split(".")[1])

        # location
        location = event.get("location")

        row = {
            "eID": event["type"]["id"],
            "gameclock": 60 * minute + second + millisecond * 0.001,
            "pID": player["id"] if player is not None else None,
            "tID": event["team"]["id"],
            "mID": mID,
            "outcome": _statsbomb_outcome(details),
            "timestamp": timestamp,
            "minute": minute,
            "second": second,
            "at_x": location[0] if location is not None else None,
            "at_y": location[1] if location is not None else None,
            "to_x": details["end_location"][0] if has_end_location else None,
            "to_y": details["end_location"][1] if has_end_location else None,
            "event_name": event_name,
            "player_name": player["name"] if player is not None else None,
            "team_name": event["team"]["name"],
            "qualifier": str(_statsbomb_qualifier(event, threesixty_by_uuid)),
        }
        event_bin = team_event_lists[team][segment]
        for col, value in row.items():
            event_bin[col].append(value)

    # create objects
    events_objects = {}
    for segment in segments:
        events_objects[segment] = {}
        for team in ["Home", "Away"]:
            events_objects[segment][team] = Events(
                events=pd.DataFrame(data=team_event_lists[team][segment]),
            )
    teamsheets = {
        "Home": teamsheet_home,
        "Away": teamsheet_away,
    }

    # pack objects
    data_objects = (events_objects, teamsheets)

    return data_objects


# outcome names that are translated to a successful (1) or failed (0) event outcome
_STATSBOMB_SUCCESS_OUTCOMES = ("Goal", "Won", "Complete", "Success In Play")
_STATSBOMB_FAILURE_OUTCOMES = (
    "Incomplete",
    "Lost In Play",
    "Saved Off Target",
    "Off T",
    "Blocked",
)

# event keys that are left out of the qualifier dictionary
_STATSBOMB_NON_QUALIFIER_KEYS = frozenset(
    {
        "team",
        "player",
        "period",
        "timestamp",
        "minute",
        "second",
        "location",
        "id",
        "type",
    }
)


def _statsbomb_outcome(details):
    """Translate the outcome of a type-specific event payload to 1, 0 or None."""
    if details is None or "outcome" not in details:
        return None
    outcome_name = details["outcome"]["name"]
    if outcome_name in _STATSBOMB_SUCCESS_OUTCOMES:
        return 1
    if outcome_name in _STATSBOMB_FAILURE_OUTCOMES:
        return 0
    return None


def _statsbomb_qualifier(event, threesixty_by_uuid):
    """Collect the event ID, remaining event attributes and 360 data into a dict."""
    qual_dict = {"unique_identifier": event["id"]}
    qual_dict.update(
        (key, value)
        for key, value in event.items()
        if key not in _STATSBOMB_NON_QUALIFIER_KEYS
    )
    if threesixty_by_uuid is not None:
        frames = threesixty_by_uuid.get(qual_dict["unique_identifier"], [])
        if len(frames) == 1:
            qual_dict["360_freeze_frame"] = frames[0]["freeze_frame"]
            qual_dict["360_visible_area"] = frames[0]["visible_area"]
        elif len(frames) > 1:
            warnings.warn(
                f"Found ambiguous StatsBomb event ID "
                f"{qual_dict['unique_identifier']} matching to more than one "
                f"StatsBomb360 event."
            )
    return qual_dict