Source code for floodlight.io.opta

import re
from pathlib import Path
from typing import Dict, Tuple, Union

import pytz
import iso8601
import pandas as pd
from lxml import etree

from floodlight.core.events import Events
from floodlight.core.pitch import Pitch
from floodlight.io.utils import get_and_convert


def get_opta_feedtype(filepath: Union[str, Path]) -> Union[str, None]:
    """Tries to extract the feed type from Opta's XML feed.

    This function assumes that the file follows Opta's format of producing feeds.
    Thus it should have a "PRODUCTION HEADER" comment at the top of the file so that
    the line at zero-based index 6 (i.e. the seventh line) reads something like
    ``production module: Opta::Feed::XML::Soccer::F24``.

    Parameters
    ----------
    filepath : Union[str, Path]
        Full path to Opta XML file.

    Returns
    -------
    feedtype: str or None
        Returns the type of the feed as a string in case it finds it, e.g. 'F24',
        and `None` otherwise. `None` is also returned if the file is too short to
        contain the production module line.
    """
    # default result; also covers files with fewer than seven lines, for which the
    # loop below ends without ever reaching the production module line
    feedtype = None

    with open(str(filepath), "r") as f:
        # iterate through first lines instead of loading entire file to RAM
        for i, line in enumerate(f):
            # the production module is expected at zero-based line index 6
            if i == 6:
                production_tags = line.strip().split(":")
                if production_tags[0] == "production module":
                    # the feed type is the last '::'-separated component
                    feedtype = production_tags[-1]
                break

    return feedtype
def read_event_data_xml(
    filepath: Union[str, Path]
) -> Tuple[Dict[str, Dict[str, Events]], Pitch]:
    """Parse Opta's f24 feed (containing match events) and extract event data and
    pitch information.

    This function provides a high-level access to the particular f24 feed and will
    return event objects for both teams. The number of segments is inferred from the
    ``period_*_start`` attributes of the ``Game`` element, and the data of each
    segment is stored in a separate object.

    Parameters
    ----------
    filepath: str or pathlib.Path
        Full path to the XML feed.

    Returns
    -------
    data_objects: Tuple[Dict[str, Dict[str, Events]], Pitch]
        Tuple of (nested) floodlight core objects with shape (events_objects,
        pitch). ``events_objects`` is a nested dictionary containing ``Events``
        objects for each team and segment of the form
        ``events_objects[segment][team] = Events``. For a typical league match with
        two halves and teams this dictionary looks like:
        ``{'HT1': {'Home': Events, 'Away': Events},
        'HT2': {'Home': Events, 'Away': Events}}``.
        ``pitch`` is a ``Pitch`` object corresponding to the data.

    Raises
    ------
    ValueError
        If the file is not recognized as an Opta F24 feed.

    Notes
    -----
    Opta's format of handling event data information involves an elaborate use of
    so called qualifiers, which attach additional information to certain events.
    There also exist a number of mappings that define which qualifiers may be
    attached to which kind of events. Parsing this information involves quite a bit
    of logic and is planned to be included in further releases. As of now, qualifier
    information is parsed as a string in the `qualifier` column of the returned
    DataFrame and can be transformed to a dict of the form
    `{qualifier_id: value}`.
    """
    # refuse anything that does not identify itself as an F24 feed
    if get_opta_feedtype(filepath) != "F24":
        raise ValueError(f"Not an Opta F24 feed: {filepath}")

    # the whole XML tree is held in memory
    root = etree.parse(str(filepath)).getroot()

    # --- 1. match information -------------------------------------------------
    matchinfo = root.xpath("Game")[0].attrib
    teams = ["Home", "Away"]
    team_by_id = {
        int(matchinfo["home_team_id"]): "Home",
        int(matchinfo["away_team_id"]): "Away",
    }
    # one segment per ``period_<n>_start`` attribute found on the Game element
    period_start_key = re.compile("period_._start")
    number_of_periods = sum(1 for key in matchinfo if period_start_key.match(key))
    segments = [f"HT{number}" for number in range(1, number_of_periods + 1)]

    # --- 2. event parsing -----------------------------------------------------
    # column order here is the column order of the resulting DataFrames
    columns = [
        "eID",
        "gameclock",
        "pID",
        "outcome",
        "timestamp",
        "minute",
        "second",
        "at_x",
        "at_y",
        "qualifier",
    ]
    event_lists = {}
    for team in teams:
        event_lists[team] = {
            segment: {column: [] for column in columns} for segment in segments
        }
    directions = {team: {} for team in teams}
    direction_codes = {"Left to Right": "lr", "Right to Left": "rl"}
    minute_offsets = {1: 0, 2: 45, 3: 90, 4: 105}
    kickoffs = {}

    def locate(element):
        # resolve the (period, segment, team) triple shared by both event passes
        period_id = get_and_convert(element.attrib, "period_id", int)
        segment_name = "HT" + str(period_id)
        team_id = get_and_convert(element.attrib, "team_id", int)
        return period_id, segment_name, team_by_id[team_id]

    # first pass: kickoff events (type 32) provide kickoff time and playing direction
    # (NOTE: kickoff times can also be directly found in matchinfo, although the
    # explicit kickoff-event timestamps appear to be more accurate)
    for kickoff in root.xpath("Game/Event[@type_id='32']"):
        _, segment, team = locate(kickoff)

        # a later kickoff event of the same segment overwrites an earlier one
        kickoffs[segment] = iso8601.parse_date(
            get_and_convert(kickoff.attrib, "timestamp", str),
            default_timezone=pytz.utc,
        )

        # qualifier 127 carries the playing direction, if present
        direction_qualifiers = kickoff.xpath("Q[@qualifier_id='127']")
        if direction_qualifiers:
            raw_direction = get_and_convert(direction_qualifiers[0], "value", str)
            directions[team][segment] = direction_codes.get(raw_direction)
        else:
            directions[team][segment] = None

    # second pass: all events (kickoff events are deliberately parsed again here)
    match_periods = range(1, 6)
    for event in root.xpath("Game/Event"):
        period, segment, team = locate(event)

        # skip match-unrelated events
        if period not in match_periods:
            continue

        # skip unwanted events (type 30)
        event_id = get_and_convert(event.attrib, "type_id", int)
        if event_id == 30:
            continue

        row = {
            "eID": event_id,
            "pID": get_and_convert(event.attrib, "player_id", int),
            "outcome": get_and_convert(event.attrib, "outcome", int),
        }
        bins = event_lists[team][segment]

        # absolute and relative time; the minute is made relative to the segment
        # NOTE(review): the period filter above admits period 5, but
        # ``minute_offsets`` only has entries for periods 1-4, so a period-5 event
        # raises KeyError on the next lookup - confirm intended handling.
        event_timestring = get_and_convert(event.attrib, "timestamp", str)
        row["minute"] = (
            get_and_convert(event.attrib, "min", int) - minute_offsets[period]
        )
        row["second"] = get_and_convert(event.attrib, "sec", int)
        row["timestamp"] = iso8601.parse_date(
            event_timestring, default_timezone=pytz.utc
        )
        elapsed = (row["timestamp"] - kickoffs[segment]).total_seconds()
        # pre-kick-off events (e.g. substitutions) are clamped to 00:00
        row["gameclock"] = max(elapsed, 0.0)

        # location
        row["at_x"] = get_and_convert(event.attrib, "x", float)
        row["at_y"] = get_and_convert(event.attrib, "y", float)

        # qualifiers are stored as the string form of {qualifier_id: value}
        row["qualifier"] = str(
            {
                int(child.attrib["qualifier_id"]): child.attrib.get("value")
                for child in event.iterchildren()
            }
        )

        for column in columns:
            bins[column].append(row[column])

    # --- 3. assemble core objects ---------------------------------------------
    events_objects = {
        segment: {
            team: Events(
                events=pd.DataFrame(data=event_lists[team][segment]),
                direction=directions[team][segment],
            )
            for team in teams
        }
        for segment in segments
    }
    pitch = Pitch.from_template("opta", sport="football")

    return (events_objects, pitch)