import os.path
import warnings
from typing import Dict, Tuple, Union
from pathlib import Path
import numpy as np
import pandas as pd
from lxml import etree
from floodlight.io.utils import download_from_url, get_and_convert
from floodlight.core.code import Code
from floodlight.core.events import Events
from floodlight.core.pitch import Pitch
from floodlight.core.teamsheet import Teamsheet
from floodlight.core.xy import XY
from floodlight.settings import DATA_DIR
# ----------------------------- StatsPerform Open Format -------------------------------
def _create_metadata_from_open_csv_df(
    csv_df: pd.DataFrame,
) -> Tuple[Dict[int, tuple], Pitch]:
    """Creates meta information from a pd.DataFrame that results from parsing the open
    StatsPerform event data CSV file.

    Parameters
    ----------
    csv_df: pd.DataFrame
        Data Frame with the parsed event data CSV file.

    Returns
    -------
    periods: Dict[int, tuple]
        Dictionary with start and endframes:
        ``periods[segment] = (startframe, endframe)``.
    pitch: Pitch
        Playing Pitch object.
    """
    # create pitch from the dimensions stored in (every row of) the CSV file
    pi_len = csv_df["pitch_dimension_long_side"].values[0]
    pi_wid = csv_df["pitch_dimension_short_side"].values[0]
    pitch = Pitch.from_template(
        "statsperform_open",
        length=pi_len,
        width=pi_wid,
        sport="football",
    )

    # create periods for segments, coded as jumps in the frame sequence
    periods = {}
    frame_values = csv_df["frame_count"].unique()

    # positions where the frame count jumps by more than one mark segment starts;
    # np.where returns a tuple of index arrays, so select the array explicitly
    seg_idx = np.where(np.diff(frame_values, prepend=frame_values[0]) > 1)[0]
    seg_idx = np.insert(seg_idx, 0, 0)
    seg_idx = np.append(seg_idx, len(frame_values))
    for segment in range(len(seg_idx) - 1):
        start = int(frame_values[seg_idx[segment]])
        end = int(frame_values[seg_idx[segment + 1] - 1])
        periods[segment] = (start, end)

    return periods, pitch
def _read_open_event_csv_single_line(
line: str,
) -> Tuple[Dict, str, str]:
"""Extracts all relevant information from a single line of StatsPerform's Event csv
file (i.e. one single event in the data).
Parameters
----------
line: str
One full line from StatsPerform's Event CSV file.
Returns
-------
event: Dict
Dictionary with relevant event information in the form:
``event[attribute] = value``.
"""
event = {}
attrib = line.split(sep=",")
# description
event["eID"] = attrib[5].replace(" ", "")
# relative time
event["gameclock"] = float(attrib[4])
event["frameclock"] = float(attrib[2])
# segment, player and team
segment = attrib[3]
team = attrib[9]
event["tID"] = team
event["pID"] = attrib[8]
# outcome
event["outcome"] = np.nan
if "Won" in attrib[5].split(" "):
event["outcome"] = 1
elif "Lost" in attrib[5].split(" "):
event["outcome"] = 0
# minute and second of game
event["minute"] = np.floor(event["gameclock"] / 60)
event["second"] = np.floor(event["gameclock"] - event["minute"] * 60)
# additional information (qualifier)
event["qualifier"] = {
"event_id": attrib[1],
"event_type_id": attrib[6],
"sequencenumber": attrib[7],
"jersey_no": attrib[10],
"is_pass": attrib[11],
"is_cross": attrib[12],
"is_corner": attrib[13],
"is_free_kick": attrib[14],
"is_goal_kick": attrib[15],
"passtypeid": attrib[16],
"wintypeid": attrib[17],
"savetypeid": attrib[18],
"possessionnumber": attrib[19],
}
return event, team, segment
def read_teamsheets_from_open_data_csv(
    filepath_csv: Union[str, Path]
) -> Dict[str, Teamsheet]:
    """Parses the entire open StatsPerform position data CSV file for unique jIDs
    (jerseynumbers) and creates teamsheets for both teams.

    Parameters
    ----------
    filepath_csv: str or pathlib.Path
        CSV file containing either open position or open event data.

    Returns
    -------
    teamsheets: Dict[str, Teamsheet]
        Dictionary with teamsheets for the home team and the away team.

    Notes
    -----
    Statsperform open data does not contain any player names. Thus, the teamsheet
    objects generated by this method will name players 'Player i' with i starting at 1.
    To identify players, use the jersey numbers of players or provide custom teamsheets
    generated by a different parser if Statsperform open data is used in combination
    with other data providers.
    """
    # read CSV file into pd.DataFrame
    csv_df = pd.read_csv(str(filepath_csv))

    # initialize team and ball ids
    team_ids = {"Home": 1.0, "Away": 2.0}
    ball_id = 4

    # check for additional tIDs
    for tID in csv_df["team_id"].unique():
        if not (tID in team_ids.values() or tID == ball_id or np.isnan(tID)):
            warnings.warn(
                f"tID {tID} did not match any of the standard tIDs "
                f"({list(team_ids.values())}) or the ball ID ({ball_id})!"
            )

    # initialize teamsheets
    teamsheets = {
        "Home": pd.DataFrame(columns=["player", "jID", "pID", "tID"]),
        "Away": pd.DataFrame(columns=["player", "jID", "pID", "tID"]),
    }

    # loop over teams
    for team in team_ids:
        # extract list with pID and jID information for all players in the team
        team_id = team_ids[team]
        team_df = csv_df[csv_df["team_id"] == team_id]
        jIDs = team_df["jersey_no"].unique()
        pIDs = [
            team_df[team_df["jersey_no"] == jID]["player_id"].unique() for jID in jIDs
        ]
        # possible check for multiple pIDs assigned to a single jID

        # insert data to teamsheet
        # NOTE(review): numbering starts at 0 here while the docstring says players
        # are named 'Player i' with i starting at 1 — confirm intended numbering
        teamsheets[team]["player"] = [f"Player {i}" for i in range(len(pIDs))]
        teamsheets[team]["jID"] = [jID for jID in jIDs]
        teamsheets[team]["pID"] = [pID[0] for pID in pIDs]
        teamsheets[team]["tID"] = team_id

    # create teamsheet objects
    for team in teamsheets:
        teamsheets[team] = Teamsheet(teamsheets[team])

    return teamsheets
def read_open_event_data_csv(
    filepath_events: Union[str, Path],
    teamsheet_home: Teamsheet = None,
    teamsheet_away: Teamsheet = None,
) -> Tuple[Dict[str, Dict[str, Events]], Dict[str, Teamsheet]]:
    """Parses an open StatsPerform Match Event CSV file and extracts the event data and
    teamsheets.

    This function provides high-level access to the particular openly published
    StatsPerform match events CSV file (e.g. for the Pro Forum '22) and returns Event
    objects for both teams.

    Parameters
    ----------
    filepath_events: str or pathlib.Path
        Full path to the CSV file where the event data in StatsPerform csv format is
        saved.
    teamsheet_home: Teamsheet, optional
        Teamsheet-object for the home team. If given as None (default), teamsheet is
        extracted from the event data CSV file.
    teamsheet_away: Teamsheet, optional
        Teamsheet-object for the away team. If given as None (default), teamsheet is
        extracted from the event data CSV file.

    Returns
    -------
    data_objects: Tuple[Dict[str, Dict[str, Events]], Dict[str, Teamsheet]]
        Tuple of (nested) floodlight core objects with shape (events_objects,
        teamsheets).
        ``events_objects`` is a nested dictionary containing ``Events`` objects for
        each team and segment of the form ``events_objects[segment][team] = Events``.
        For a typical league match with two halves and teams this dictionary looks like:
        ``{'1': {'Home': Events, 'Away': Events}, '2': {'Home': Events, 'Away': Events}
        }``.
        ``teamsheets`` is a dictionary containing ``Teamsheet`` objects for each team
        of the form ``teamsheets[team] = Teamsheet``.

    Notes
    -----
    StatsPerform's open format of handling provides certain additional event attributes,
    which attach additional information to certain events. As of now, these information
    are parsed as a string in the ``qualifier`` column of the returned DataFrame and can
    be transformed to a dict of form ``{attribute: value}``.
    """
    # initialize bins and variables
    team_ids = {"Home": 1.0, "Away": 2.0}
    segments = ["1", "2"]
    # collect event dicts per team and segment; DataFrames are built once at the
    # end, since pd.DataFrame.append was removed in pandas 2.0
    events = {
        team_id: {segment: [] for segment in segments}
        for team_id in team_ids.values()
    }

    # create or check teamsheet objects
    if teamsheet_home is None and teamsheet_away is None:
        teamsheets = read_teamsheets_from_open_data_csv(filepath_events)
        teamsheet_home = teamsheets["Home"]
        teamsheet_away = teamsheets["Away"]
    elif teamsheet_home is None:
        teamsheets = read_teamsheets_from_open_data_csv(filepath_events)
        teamsheet_home = teamsheets["Home"]
    elif teamsheet_away is None:
        teamsheets = read_teamsheets_from_open_data_csv(filepath_events)
        teamsheet_away = teamsheets["Away"]
    else:
        pass
        # potential check

    # parse event data
    with open(str(filepath_events), "r") as f:
        while True:
            line = f.readline()

            # terminate if at end of file
            if len(line) == 0:
                break

            # skip the head
            if line.split(sep=",")[3] == "current_phase":
                continue

            # read single line
            event, team, segment = _read_open_event_csv_single_line(line)

            # insert to bin
            if team:
                events[float(team)][segment].append(event)
            else:  # if no clear assignment possible, insert to bins for both teams
                for team_id in team_ids.values():
                    events[team_id][segment].append(event)

    # create objects
    events_objects = {}
    for segment in segments:
        events_objects[segment] = {}
        for team in ["Home", "Away"]:
            events_objects[segment][team] = Events(
                events=pd.DataFrame(data=events[team_ids[team]][segment]),
            )
    teamsheets = {
        "Home": teamsheet_home,
        "Away": teamsheet_away,
    }

    # pack objects
    data_objects = (events_objects, teamsheets)

    return data_objects
def read_open_position_data_csv(
    filepath_position: Union[str, Path],
    teamsheet_home: Teamsheet = None,
    teamsheet_away: Teamsheet = None,
) -> Tuple[Dict[int, Dict[str, XY]], Dict[int, Code], Dict[str, Teamsheet], Pitch]:
    """Parses an open StatsPerform CSV file and extract position data and possession
    codes as well as teamsheets and pitch information.

    Openly published StatsPerform position data (e.g. for the Pro Forum '22) is stored
    in a CSV file containing all position data (for both halves) as well as information
    about players, the pitch, and the ball possession. This function provides high-level
    access to StatsPerform data by parsing the CSV file.

    Parameters
    ----------
    filepath_position: str or pathlib.Path
        Full path to the CSV file.
    teamsheet_home: Teamsheet, optional
        Teamsheet-object for the home team used to create link dictionaries of the form
        `links[team][jID] = xID`. The links are used to map players to a specific xID in
        the respective XY objects. Should be supplied for custom ordering. If given as
        None (default), teamsheet is extracted from the open StatsPerform CSV file and
        its xIDs are assigned in order of appearance.
    teamsheet_away: Teamsheet, optional
        Teamsheet-object for the away team. If given as None (default), teamsheet is
        extracted from the Match Information XML file. See teamsheet_home for details.

    Returns
    -------
    data_objects: Tuple[Dict[int, Dict[str, XY]], Dict[int, Code], \
            Dict[str, Teamsheet], Pitch]
        Tuple of (nested) floodlight core objects with shape (xy_objects,
        possession_objects, teamsheets, pitch).
        ``xy_objects`` is a nested dictionary containing ``XY`` objects for each team
        and segment of the form ``xy_objects[segment][team] = XY``. For a typical
        league match with two halves and teams this dictionary looks like:
        ``{0: {'Home': XY, 'Away': XY}, 1: {'Home': XY, 'Away': XY}}``.
        ``possession_objects`` is a dictionary containing ``Code`` objects with
        possession information (home or away) for each segment of the form
        ``possession_objects[segment] = Code``.
        ``teamsheets`` is a dictionary containing ``Teamsheet`` objects for each team
        of the form ``teamsheets[team] = Teamsheet``.
        ``pitch`` is a ``Pitch`` object corresponding to the data.
    """
    # parse the CSV file into pd.DataFrame
    dat_df = pd.read_csv(str(filepath_position))

    # initialize team and ball ids
    team_ids = {"Home": 1.0, "Away": 2.0}
    ball_id = 4

    # check for additional tIDs
    for ID in dat_df["team_id"].unique():
        if not (ID in team_ids.values() or ID == ball_id):
            warnings.warn(f"Team ID {ID} did not match any of the standard IDs!")

    # create or check teamsheet objects
    if teamsheet_home is None and teamsheet_away is None:
        teamsheets = read_teamsheets_from_open_data_csv(filepath_position)
        teamsheet_home = teamsheets["Home"]
        teamsheet_away = teamsheets["Away"]
    elif teamsheet_home is None:
        teamsheets = read_teamsheets_from_open_data_csv(filepath_position)
        teamsheet_home = teamsheets["Home"]
    elif teamsheet_away is None:
        teamsheets = read_teamsheets_from_open_data_csv(filepath_position)
        teamsheet_away = teamsheets["Away"]
    else:
        pass
        # potential check

    # create links from jersey numbers to array indices
    if "xID" not in teamsheet_home.teamsheet.columns:
        teamsheet_home.add_xIDs()
    if "xID" not in teamsheet_away.teamsheet.columns:
        teamsheet_away.add_xIDs()
    links_jID_to_xID = {
        "Home": teamsheet_home.get_links("jID", "xID"),
        "Away": teamsheet_away.get_links("jID", "xID"),
    }

    # create periods and pitch
    periods, pitch = _create_metadata_from_open_csv_df(dat_df)
    segments = list(periods.keys())

    # infer data shapes
    number_of_players = {team: len(links_jID_to_xID[team]) for team in links_jID_to_xID}
    number_of_frames = {}
    for segment in segments:
        start = periods[segment][0]
        end = periods[segment][1]
        number_of_frames[segment] = end - start + 1

    # bins, pre-filled with NaNs for (x, y) pairs per player
    codes = {"possession": {segment: [] for segment in segments}}
    xydata = {
        "Home": {
            segment: np.full(
                [number_of_frames[segment], number_of_players["Home"] * 2],
                np.nan,
            )
            for segment in periods
        },
        "Away": {
            segment: np.full(
                [number_of_frames[segment], number_of_players["Away"] * 2],
                np.nan,
            )
            for segment in periods
        },
        "Ball": {
            segment: np.full([number_of_frames[segment], 2], np.nan)
            for segment in periods
        },
    }

    # loop over segments
    for segment in segments:
        # teams
        for team in team_ids:
            team_df = dat_df[dat_df["team_id"] == team_ids[team]]
            for pID in team_df["player_id"].unique():
                # extract player information
                pl_df = team_df[team_df["player_id"] == pID]
                frames = pl_df["frame_count"].values
                x_position = pl_df["pos_x"].values
                y_position = pl_df["pos_y"].values

                # compute appearance of player in segment
                appearance = np.array(
                    [
                        (periods[segment][0] <= frame <= periods[segment][-1])
                        for frame in frames
                    ]
                )
                # check for players that did not play in segment
                if not np.sum(appearance):
                    continue

                # insert player position to bin array
                jrsy = int(pl_df["jersey_no"].values[0])
                x_col = (links_jID_to_xID[team][jrsy] - 1) * 2
                y_col = x_col + 1
                start = frames[appearance][0] - periods[segment][0]
                end = frames[appearance][-1] - periods[segment][0] + 1
                xydata[team][segment][start:end, x_col] = x_position[appearance]
                xydata[team][segment][start:end, y_col] = y_position[appearance]

        # ball
        ball_df = dat_df[dat_df["team_id"] == ball_id]
        frames = ball_df["frame_count"].values
        appearance = np.array(
            [(periods[segment][0] <= frame <= periods[segment][-1]) for frame in frames]
        )
        xydata["Ball"][segment][:, 0] = ball_df["pos_x"].values[appearance]
        # BUGFIX: the y-coordinate was previously filled from 'pos_x' as well
        xydata["Ball"][segment][:, 1] = ball_df["pos_y"].values[appearance]

        # update codes
        codes["possession"][segment] = ball_df["possession"].values[appearance]

    # create objects
    xy_objects = {}
    possession_objects = {}
    for segment in segments:
        xy_objects[segment] = {}
        possession_objects[segment] = Code(
            code=codes["possession"][segment],
            name="possession",
            definitions=dict([(team_id, team) for team, team_id in team_ids.items()]),
            framerate=10,
        )
        for team in ["Home", "Away", "Ball"]:
            xy_objects[segment][team] = XY(
                xy=xydata[team][segment],
                framerate=10,
            )
    teamsheets = {
        "Home": teamsheet_home,
        "Away": teamsheet_away,
    }

    # pack objects
    data_objects = (
        xy_objects,
        possession_objects,
        teamsheets,
        pitch,
    )

    return data_objects
# ----------------------------- StatsPerform Format ---------------------------
def _read_position_data_txt_single_line(
line: str,
) -> Tuple[
int,
int,
Dict[str, Dict[str, Tuple[float, float, float]]],
Dict[str, Union[str, tuple]],
]:
"""Extracts all relevant information from a single line of StatsPerform's position
data TXT file (i.e. one frame of data).
Parameters
----------
line: str
One full line from StatsPerform's .txt-file, equals one sample of data.
Returns
-------
gameclock: int
The gameclock of the current segment in milliseconds.
segment: int
The segment identifier.
positions: Dict[str, Dict[str, Tuple[float, float, float]]]
Nested dictionary that stores player position information for each team and
player. Has the form ``positions[team][jID] = (x, y)``.
ball: Dict[str]
Dictionary with ball information. Has keys 'position', 'possession' and
'ballstatus'.
"""
# bins
positions = {"Home": {}, "Away": {}, "Other": {}}
ball = {}
# read chunks
chunks = line.split(":")
time_chunk = chunks[0]
player_chunks = chunks[1].split(";")
ball_chunk = None
if len(chunks) > 2: # check if ball information exist in chunk
ball_chunk = chunks[2]
# time chunk
# systemclock = time_chunk.split(";")[0]
# possible check or synchronization step
timeinfo = time_chunk.split(";")[1].split(",")
gameclock = int(timeinfo[0])
segment = int(timeinfo[1])
# ballstatus = timeinfo[2].split(":")[0] == '0' # '0' seems to be always the case?
# player chunks
for player_chunk in player_chunks:
# skip final entry of chunk
if not player_chunk or player_chunk == "\n":
continue
# read team
chunk_data = player_chunk.split(",")
if chunk_data[0] in ["0", "3"]:
team = "Home"
elif chunk_data[0] in ["1", "4"]:
team = "Away"
else:
team = "Other"
# read IDs
# pID = chunk_data[1]
jID = chunk_data[2]
# read positions
x, y = map(lambda x: float(x), chunk_data[3:])
# assign
positions[team][jID] = (x, y)
# ball chunk
if ball_chunk is not None:
x, y, z = map(lambda x: float(x), ball_chunk.split(";")[0].split(","))
# ball["position"] = (x, y, z) # z-coordinate is not yet supported
ball["position"] = (x, y)
return gameclock, segment, positions, ball
def _read_time_information_from_position_data_txt(
    filepath_position: Union[str, Path],
) -> Tuple[Dict, Union[int, None]]:
    """Reads StatsPerform's position TXT file and extracts information about the first
    and last frame of periods. Also, a framerate is estimated from the
    gameclock difference between samples.

    Parameters
    ----------
    filepath_position: str or pathlib.Path
        Full path to the TXT file containing the position data.

    Returns
    -------
    periods: Dict
        Dictionary with start and endframes:
        ``periods[segment] = (startframe, endframe)``.
    framerate_est: int or None
        Estimated temporal resolution of data in frames per second/Hertz.
    """
    # bins
    startframes = {}
    endframes = {}
    framerate_est = None
    last_gameclock = None
    last_segment = None

    # read TXT file line by line; context manager ensures the handle is closed
    # even if parsing raises
    with open(filepath_position, "r") as file_txt:
        for line in file_txt:
            # read gameclock and segment
            gameclock, segment, _, _ = _read_position_data_txt_single_line(line)

            # a new segment starts: record its startframe and close the previous one
            if segment not in startframes:
                startframes[segment] = gameclock
                if last_gameclock is not None:
                    endframes[last_segment] = last_gameclock

            # estimate framerate from the gameclock difference between samples
            if last_gameclock is not None:
                delta = np.absolute(gameclock - last_gameclock)  # in milliseconds
                if framerate_est is None:
                    framerate_est = int(1000 / delta)
                elif framerate_est != int(1000 / delta) and last_segment == segment:
                    warnings.warn(
                        f"Framerate estimation yielded diverging results."
                        f"The originally estimated framerate of {framerate_est} Hz did not "
                        f"match the current estimation of {int(1000 / delta)} Hz. This "
                        f"might be caused by missing frame(s) in the position data. "
                        f"Continuing by choosing the latest estimation of "
                        f"{int(1000 / delta)} Hz"
                    )
                    framerate_est = int(1000 / delta)

            # update variables
            last_gameclock = gameclock
            last_segment = segment

    # update end of final segment (skip if the file contained no parsable lines)
    if last_segment is not None:
        endframes[last_segment] = last_gameclock

    # assembly
    periods = {
        segment: (startframes[segment], endframes[segment]) for segment in startframes
    }

    return periods, framerate_est
def _read_jersey_numbers_from_position_data_txt(
    file_location_txt: Union[str, Path],
) -> Tuple[set, set]:
    """Reads StatsPerform's position TXT file and extracts unique set of jIDs
    (jerseynumbers) for both teams.

    Parameters
    ----------
    file_location_txt: str or pathlib.Path
        Full path to the TXT file containing the position data.

    Returns
    -------
    home_jIDs: set
    away_jIDs: set
    """
    # bins
    home_jIDs = set()
    away_jIDs = set()

    # read TXT file line by line; context manager ensures the handle is closed
    # even if parsing raises
    with open(file_location_txt, "r") as file_txt:
        for package in file_txt:
            # read line
            _, _, positions, _ = _read_position_data_txt_single_line(package)

            # extract jersey numbers
            home_jIDs |= set(positions["Home"].keys())
            away_jIDs |= set(positions["Away"].keys())

    return home_jIDs, away_jIDs
def read_teamsheets_from_event_data_xml(
    filepath_events: Union[str, Path],
) -> Dict[str, Teamsheet]:
    """Parses the StatsPerform event file and returns two Teamsheet-objects with
    detailed player information for the home and the away team.

    Parameters
    ----------
    filepath_events: str or pathlib.Path
        Full path to the XML file containing the event data.

    Returns
    -------
    teamsheets: Dict[str, Teamsheet]
        Dictionary with teamsheets for the home team and the away team.
    """
    # load event data xml tree into memory
    tree = etree.parse(str(filepath_events))
    root = tree.getroot()

    # initialize teamsheets
    columns = ["player", "position", "team_name", "jID", "pID", "tID", "started"]
    teamsheets = {
        "Home": pd.DataFrame(columns=columns),
        "Away": pd.DataFrame(columns=columns),
    }

    # parse player information
    for team_matchsheet in root.findall("MatchSheet/Team"):
        # skip referees
        if team_matchsheet.attrib["Type"] == "Referees":
            continue

        # read team
        team = team_matchsheet.attrib["Type"][:-4]  # cut 'Team' of e.g. 'HomeTeam'
        tID = team_matchsheet.attrib["IdTeam"]
        team_name = team_matchsheet.attrib["Name"]

        # find players (other Actor entries, e.g. staff, are skipped)
        players = [
            actor
            for actor in team_matchsheet.findall("Actor")
            if actor.attrib["Occupation"] == "Player"
        ]

        # create teamsheet
        teamsheets[team]["player"] = [
            get_and_convert(player, "NickName", str) for player in players
        ]
        teamsheets[team]["pID"] = [
            get_and_convert(player, "IdActor", int) for player in players
        ]
        teamsheets[team]["jID"] = [
            get_and_convert(player, "JerseyNumber", int) for player in players
        ]
        teamsheets[team]["position"] = [
            get_and_convert(player, "Position", str) for player in players
        ]
        teamsheets[team]["started"] = [
            player.get("IsStarter") == "True" for player in players
        ]
        teamsheets[team]["tID"] = tID
        teamsheets[team]["team_name"] = team_name

    # create teamsheet objects
    for team in teamsheets:
        teamsheets[team] = Teamsheet(teamsheets[team])

    return teamsheets
def read_teamsheets_from_position_data_txt(
    filepath_position: Union[str, Path],
) -> Dict[str, Teamsheet]:
    """Parses the StatsPerform position file and returns two simple Teamsheet-objects
    containing only two columns "player" and "jID" for the home and the away team.

    Parameters
    ----------
    filepath_position: str or pathlib.Path
        Full path to the TXT file containing the position data.

    Returns
    -------
    teamsheets: Dict[str, Teamsheet]
        Dictionary with teamsheets for the home team and the away team.
    """
    # create sorted lists of jIDs per team
    homejrsy, awayjrsy = _read_jersey_numbers_from_position_data_txt(filepath_position)
    # NOTE(review): jIDs are sorted as strings here (lexicographic, e.g. '10' < '2'),
    # which determines player order — confirm this is the intended ordering
    jIDs = {
        "Home": sorted(homejrsy),
        "Away": sorted(awayjrsy),
    }

    # create teamsheets
    teamsheets = {
        "Home": pd.DataFrame(columns=["player", "jID"]),
        "Away": pd.DataFrame(columns=["player", "jID"]),
    }
    for team in teamsheets:
        teamsheets[team]["player"] = [f"Player {i}" for i in range(len(jIDs[team]))]
        teamsheets[team]["jID"] = [int(jID) for jID in jIDs[team]]

    # create teamsheet objects
    for team in teamsheets:
        teamsheets[team] = Teamsheet(teamsheets[team])

    return teamsheets
def read_event_data_xml(
    filepath_events: Union[str, Path],
    teamsheet_home: Teamsheet = None,
    teamsheet_away: Teamsheet = None,
) -> Tuple[Dict[str, Dict[str, Events]], Dict[str, Teamsheet], Pitch]:
    """Parses a StatsPerform XML file and extracts event data and pitch information.

    This function provides high-level access to the StatsPerform match events XML file
    and returns Events objects for both teams and information about the pitch.

    Parameters
    ----------
    filepath_events: str or pathlib.Path
        Full path to the XML file containing the event data.
    teamsheet_home: Teamsheet, optional
        Teamsheet-object for the home team used to create link dictionaries of the form
        `links[pID] = team`. The links are used to map players to the home and away
        teams. If given as None (default), teamsheet is extracted from the event data
        XML file.
    teamsheet_away: Teamsheet, optional
        Teamsheet-object for the away team. If given as None (default), teamsheet is
        extracted from the event data XML file. See teamsheet_home for details.

    Returns
    -------
    data_objects: Tuple[Dict[str, Dict[str, Events]], Dict[str, Teamsheet], Pitch]
        Tuple of (nested) floodlight core objects with shape (events_objects,
        teamsheets, pitch).
        ``events_objects`` is a nested dictionary containing ``Events`` objects for
        each team and segment of the form ``events_objects[segment][team] = Events``.
        For a typical league match with two halves and teams this dictionary looks like:
        ``{'HT1': {'Home': Events, 'Away': Events}, 'HT2': {'Home': Events, 'Away':
        Events}}``.
        ``teamsheets`` is a dictionary containing ``Teamsheet`` objects for each team
        of the form ``teamsheets[team] = Teamsheet``.
        ``pitch`` is a ``Pitch`` object corresponding to the data.
    """
    # load xml tree into memory
    tree = etree.parse(str(filepath_events))
    root = tree.getroot()

    # create bins, read segments, and assign teams
    columns = [
        "eID",
        "gameclock",
        "pID",
        "minute",
        "second",
        "at_x",
        "at_y",
        "to_x",
        "to_y",
        "qualifier",
    ]
    segments = [
        f"HT{get_and_convert(period.attrib, 'IdHalf', str)}"
        for period in root.findall("Events/EventsHalf")
    ]
    teams = ["Home", "Away"]

    # create or check teamsheet objects
    if teamsheet_home is None and teamsheet_away is None:
        teamsheets = read_teamsheets_from_event_data_xml(filepath_events)
        teamsheet_home = teamsheets["Home"]
        teamsheet_away = teamsheets["Away"]
    elif teamsheet_home is None:
        teamsheets = read_teamsheets_from_event_data_xml(filepath_events)
        teamsheet_home = teamsheets["Home"]
    elif teamsheet_away is None:
        teamsheets = read_teamsheets_from_event_data_xml(filepath_events)
        teamsheet_away = teamsheets["Away"]
    else:
        pass
        # potential check

    # create links between pIDs and team
    links_pID_to_team = {}
    links_pID_to_team.update({pID: "Home" for pID in teamsheet_home["pID"]})
    links_pID_to_team.update({pID: "Away" for pID in teamsheet_away["pID"]})

    # bins
    event_lists = {
        team: {segment: {col: [] for col in columns} for segment in segments}
        for team in teams
    }

    # loop over events
    for half in root.findall("Events/EventsHalf"):
        # get segment information
        period = get_and_convert(half.attrib, "IdHalf", str)
        segment = "HT" + str(period)
        for event in half.findall("Event"):
            # read pID
            pID = get_and_convert(event.attrib, "IdActor1", int)

            # assign team
            team = get_and_convert(links_pID_to_team, pID, str)

            # create list of either a single team or both teams if no clear assignment
            # (get_and_convert returns the string "None" for unknown pIDs)
            if team == "None":
                teams_assigned = teams  # add to both teams
            else:
                teams_assigned = [team]  # only add to one team

            # identifier
            eID = get_and_convert(event.attrib, "EventName", str)
            for team in teams_assigned:
                event_lists[team][segment]["eID"].append(eID)
                event_lists[team][segment]["pID"].append(pID)

            # relative time
            gameclock = get_and_convert(event.attrib, "Time", int) / 1000
            minute = np.floor(gameclock / 60)
            second = np.floor(gameclock - minute * 60)
            for team in teams_assigned:
                event_lists[team][segment]["gameclock"].append(gameclock)
                event_lists[team][segment]["minute"].append(minute)
                event_lists[team][segment]["second"].append(second)

            # location
            at_x = get_and_convert(event.attrib, "LocationX", float)
            at_y = get_and_convert(event.attrib, "LocationY", float)
            to_x = get_and_convert(event.attrib, "TargetX", float)
            to_y = get_and_convert(event.attrib, "TargetY", float)
            for team in teams_assigned:
                event_lists[team][segment]["at_x"].append(at_x)
                event_lists[team][segment]["at_y"].append(at_y)
                event_lists[team][segment]["to_x"].append(to_x)
                event_lists[team][segment]["to_y"].append(to_y)

            # qualifier: keep all raw XML attributes as a stringified dict
            qual_dict = {}
            for qual_id in event.attrib:
                qual_value = event.attrib.get(qual_id)
                qual_dict[qual_id] = qual_value
            for team in teams_assigned:
                event_lists[team][segment]["qualifier"].append(str(qual_dict))

    # create pitch (dimensions are given in cm)
    length = get_and_convert(root.attrib, "FieldLength", int) / 100
    width = get_and_convert(root.attrib, "FieldWidth", int) / 100
    pitch = Pitch.from_template(
        "statsperform_event",
        length=length,
        width=width,
        sport="football",
    )

    # create objects
    events_objects = {}
    for segment in segments:
        events_objects[segment] = {}
        for team in ["Home", "Away"]:
            events_objects[segment][team] = Events(
                events=pd.DataFrame(data=event_lists[team][segment]),
            )
    teamsheets = {
        "Home": teamsheet_home,
        "Away": teamsheet_away,
    }

    # pack objects
    data_objects = (events_objects, teamsheets, pitch)

    return data_objects
def read_position_data_txt(
    filepath_position: Union[str, Path],
    teamsheet_home: Teamsheet = None,
    teamsheet_away: Teamsheet = None,
) -> Tuple[Dict[int, Dict[str, XY]], Dict[int, Teamsheet]]:
    """Parses a StatsPerform TXT file and extracts position data and teamsheets.

    Internal StatsPerform position data is stored as a TXT file containing all
    position data (for both halves). This function provides high-level access to
    StatsPerform data by parsing the TXT file. Since no information about framerate is
    delivered in the data itself, it is estimated from time difference between
    individual frames. Teamsheets are extracted from the event data, if filepath_events
    is provided. Otherwise, minimal Teamsheet-objects are inferred from the position
    data.

    Parameters
    ----------
    filepath_position: str or pathlib.Path
        Full path to the TXT file containing the position data.
    teamsheet_home: Teamsheet, optional
        Teamsheet-object for the home team used to create link dictionaries of the form
        `links[team][jID] = xID`. The links are used to map players to a specific xID in
        the respective XY objects. Should be supplied for custom ordering. If given as
        None (default), teamsheet is extracted from the position data TXT file and its
        xIDs are assigned in order of appearance.
    teamsheet_away: Teamsheet, optional
        Teamsheet-object for the away team. If given as None (default), teamsheet is
        extracted from the position data TXT file. See teamsheet_home for details.

    Returns
    -------
    data_objects: Tuple[Dict[int, Dict[str, XY]], Dict[int, Teamsheet]]
        Tuple of (nested) floodlight core objects with shape (xy_objects,
        teamsheets).
        ``xy_objects`` is a nested dictionary containing ``XY`` objects for each team
        and segment of the form ``xy_objects[segment][team] = XY``. For a typical
        league match with two halves and teams this dictionary looks like:
        ``{1: {'Home': XY, 'Away': XY}, 2: {'Home': XY, 'Away': XY}}``.
        ``teamsheets`` is a dictionary containing ``Teamsheet`` objects for each team
        of the form ``teamsheets[team] = Teamsheet``.

    Notes
    -----
    Statsperform position data does not contain any player information expect jersey
    numbers by default. Thus, the teamsheet objects generated by this method will name
    players 'Player i' with i starting at 1. To identify players, use the jersey numbers
    of players or provide custom teamsheets (e.g. by parsing teamsheets from the
    Statsperform event data or another data provider).
    """
    # parse TXT file for periods and estimate framerate if not contained in filepath
    periods, framerate_est = _read_time_information_from_position_data_txt(
        filepath_position
    )
    segments = list(periods.keys())

    # create or check teamsheet objects
    if teamsheet_home is None and teamsheet_away is None:
        teamsheets = read_teamsheets_from_position_data_txt(filepath_position)
        teamsheet_home = teamsheets["Home"]
        teamsheet_away = teamsheets["Away"]
    elif teamsheet_home is None:
        teamsheets = read_teamsheets_from_position_data_txt(filepath_position)
        teamsheet_home = teamsheets["Home"]
    elif teamsheet_away is None:
        teamsheets = read_teamsheets_from_position_data_txt(filepath_position)
        teamsheet_away = teamsheets["Away"]
    else:
        pass
        # potential check

    # create links from jersey numbers to array indices
    if "xID" not in teamsheet_home.teamsheet.columns:
        teamsheet_home.add_xIDs()
    if "xID" not in teamsheet_away.teamsheet.columns:
        teamsheet_away.add_xIDs()
    links_jID_to_xID = {
        "Home": teamsheet_home.get_links("jID", "xID"),
        "Away": teamsheet_away.get_links("jID", "xID"),
    }

    # infer data array shapes
    number_of_home_players = max(links_jID_to_xID["Home"].values()) + 1
    number_of_away_players = max(links_jID_to_xID["Away"].values()) + 1
    number_of_frames = {}
    for segment in segments:
        # gameclock is in milliseconds, so convert the period span to frames
        number_of_frames[segment] = (
            int((periods[segment][1] - periods[segment][0]) / 1000 * framerate_est) + 1
        )

    # bins, pre-filled with NaNs for (x, y) pairs per player
    xydata = {}
    xydata["Home"] = {
        segment: np.full(
            [number_of_frames[segment], number_of_home_players * 2], np.nan
        )
        for segment in segments
    }
    xydata["Away"] = {
        segment: np.full(
            [number_of_frames[segment], number_of_away_players * 2], np.nan
        )
        for segment in segments
    }
    xydata["Ball"] = {
        segment: np.full([number_of_frames[segment], 2], np.nan) for segment in segments
    }

    # read TXT file from disk, one frame of data per line
    with open(filepath_position, "r") as f:
        for package in f:
            # read line to get gameclock, player positions and ball info
            (
                gameclock,
                segment,
                positions,
                ball,
            ) = _read_position_data_txt_single_line(package)

            # check if frame is in any segment
            if segment is None:
                # skip line if not
                continue

            # otherwise calculate relative frame (in respective segment)
            frame_rel = int((gameclock - periods[segment][0]) / 1000 * framerate_est)

            # insert (x,y)-data into np.array
            for team in ["Home", "Away"]:
                for jID in positions[team].keys():
                    # map jersey number to array index and infer respective columns
                    x_col = (links_jID_to_xID[team][int(jID)] - 1) * 2
                    y_col = x_col + 1
                    xydata[team][segment][frame_rel, x_col] = positions[team][jID][0]
                    xydata[team][segment][frame_rel, y_col] = positions[team][jID][1]

            # get ball data (rows without a ball chunk stay NaN)
            xydata["Ball"][segment][frame_rel] = ball.get("position", np.nan)

    # create objects
    xy_objects = {}
    for segment in segments:
        xy_objects[segment] = {}
        for team in ["Home", "Away", "Ball"]:
            xy_objects[segment][team] = XY(
                xy=xydata[team][segment],
                framerate=framerate_est,
            )
    teamsheets = {
        "Home": teamsheet_home,
        "Away": teamsheet_away,
    }

    # pack objects
    data_objects = (
        xy_objects,
        teamsheets,
    )

    return data_objects
def read_event_data_from_url(
    url: str,
    teamsheet_home: Teamsheet = None,
    teamsheet_away: Teamsheet = None,
) -> Tuple[Dict[str, Dict[str, Events]], Dict[str, Teamsheet], Pitch]:
    """Reads a URL containing a StatsPerform events XML file and extracts the stored
    event data, pitch information, and teamsheets.

    The event data from the URL is downloaded into a temporary file stored in the
    repository's internal root ``.data``-folder and removed afterwards.

    Parameters
    ----------
    url: str
        URL to the XML file containing the event data.
    teamsheet_home: Teamsheet, optional
        Teamsheet-object for the home team used to create link dictionaries of the form
        `links[pID] = team`. The links are used to map players to the home and away
        teams. If given as None (default), teamsheet is extracted from the event data
        XML file.
    teamsheet_away: Teamsheet, optional
        Teamsheet-object for the away team. If given as None (default), teamsheet is
        extracted from the event data XML file. See teamsheet_home for details.

    Returns
    -------
    data_objects: Tuple[Dict[str, Dict[str, Events]], Dict[str, Teamsheet], Pitch]
        Tuple of (nested) floodlight core objects with shape (events_objects,
        teamsheets, pitch).
        ``events_objects`` is a nested dictionary containing ``Events`` objects for
        each team and segment of the form ``events_objects[segment][team] = Events``.
        For a typical league match with two halves and teams this dictionary looks
        like: ``{'HT1': {'Home': Events, 'Away': Events}, 'HT2': {'Home': Events,
        'Away': Events}}``.
        ``teamsheets`` is a dictionary containing ``Teamsheet`` objects for each team
        of the form ``teamsheets[team] = Teamsheet``.
        ``pitch`` is a ``Pitch`` object corresponding to the data.
    """
    # makedirs with exist_ok=True is a no-op if the directory already exists, so no
    # separate isdir check is needed
    data_dir = os.path.join(DATA_DIR, "statsperform")
    os.makedirs(data_dir, exist_ok=True)

    # download the event data into a temporary file
    temp_file = os.path.join(data_dir, "events_temp.xml")
    with open(temp_file, "wb") as binary_file:
        binary_file.write(download_from_url(url))

    # parse the downloaded file; temp_file is already the full path, so it is passed
    # directly (no second join). The finally-clause guarantees the temporary file is
    # removed even if parsing raises.
    try:
        events_objects, teamsheets, pitch = read_event_data_xml(
            filepath_events=temp_file,
            teamsheet_home=teamsheet_home,
            teamsheet_away=teamsheet_away,
        )
    finally:
        os.remove(temp_file)

    data_objects = (events_objects, teamsheets, pitch)
    return data_objects
def read_position_data_from_url(
    url: str,
    teamsheet_home: Teamsheet = None,
    teamsheet_away: Teamsheet = None,
) -> Tuple[Dict[int, Dict[str, XY]], Dict[int, Teamsheet]]:
    """Reads a URL from the StatsPerform API (StatsEdgeViewer) containing a position
    data TXT file and extracts position data and teamsheets.

    The position data from the URL is downloaded into a temporary file stored in the
    repository's internal root ``.data``-folder and removed afterwards.

    Parameters
    ----------
    url: str or pathlib.Path
        URL to the TXT file containing the position data.
    teamsheet_home: Teamsheet, optional
        Teamsheet-object for the home team used to create link dictionaries of the form
        `links[team][jID] = xID`. The links are used to map players to a specific xID
        in the respective XY objects. Should be supplied for custom ordering. If given
        as None (default), teamsheet is extracted from the position data TXT file and
        its xIDs are assigned in order of appearance.
    teamsheet_away: Teamsheet, optional
        Teamsheet-object for the away team. If given as None (default), teamsheet is
        extracted from the position data TXT file. See teamsheet_home for details.

    Returns
    -------
    data_objects: Tuple[Dict[int, Dict[str, XY]], Dict[int, Teamsheet]]
        Tuple of (nested) floodlight core objects with shape (xy_objects,
        teamsheets).
        ``xy_objects`` is a nested dictionary containing ``XY`` objects for each team
        and segment of the form ``xy_objects[segment][team] = XY``. For a typical
        league match with two halves and teams this dictionary looks like:
        ``{1: {'Home': XY, 'Away': XY}, 2: {'Home': XY, 'Away': XY}}``.
        ``teamsheets`` is a dictionary containing ``Teamsheet`` objects for each team
        of the form ``teamsheets[team] = Teamsheet``.

    Notes
    -----
    Statsperform position data does not contain any player information except jersey
    numbers by default. Thus, the teamsheet objects generated by this method will name
    players 'Player i' with i starting at 1. To identify players, use the jersey
    numbers of players or provide custom teamsheets (e.g. by parsing teamsheets from
    the Statsperform event data or another data provider).
    """
    # makedirs with exist_ok=True is a no-op if the directory already exists, so no
    # separate isdir check is needed
    data_dir = os.path.join(DATA_DIR, "statsperform")
    os.makedirs(data_dir, exist_ok=True)

    # download the position data into a temporary file
    temp_file = os.path.join(data_dir, "tracking_temp.txt")
    with open(temp_file, "wb") as binary_file:
        binary_file.write(download_from_url(url))

    # parse the downloaded file; temp_file is already the full path, so it is passed
    # directly (no second join). The finally-clause guarantees the temporary file is
    # removed even if parsing raises.
    try:
        xy_objects, teamsheets = read_position_data_txt(
            filepath_position=temp_file,
            teamsheet_home=teamsheet_home,
            teamsheet_away=teamsheet_away,
        )
    finally:
        os.remove(temp_file)

    data_objects = (xy_objects, teamsheets)
    return data_objects