Source code for floodlight.io.opta

import re
from pathlib import Path
from typing import Dict, Tuple, Union

import pytz
import iso8601
import pandas as pd
from lxml import etree

from floodlight.core.events import Events
from floodlight.core.pitch import Pitch
from floodlight.io.utils import get_and_convert



[docs]
def get_opta_feedtype(filepath: Union[str, Path]) -> Union[str, None]:
    """Tries to extract the feed type from Opta's XML feed.

    This function assumes that the file follows Opta's format of producing feeds.
    Thus it should have a "PRODUCTION HEADER" comment at the top of the file so that on
    line 6 (counting from zero, i.e. the seventh physical line) it reads something
    like ``production module:  Opta::Feed::XML::Soccer::F24``.

    Parameters
    ----------
    filepath : Union[str, Path]
        Full path to Opta XML file.

    Returns
    -------
    feedtype: str or None
        Returns the type of the feed as a string in case it finds it, e.g. 'F24',
        and `None` otherwise. `None` is also returned for files that are too short
        to contain the production module line.
    """
    # default for files that end before the production module line is reached;
    # without it the return statement below would hit an unbound local
    feedtype = None

    with open(str(filepath), "r") as f:
        # iterate through first lines instead of loading entire file to RAM
        for i, line in enumerate(f):
            # search for production module at (zero-based) line 6
            if i == 6:
                production_tags = line.strip().split(":")
                if production_tags[0] == "production module":
                    # the feed type is the last component of the `::`-separated
                    # module path, e.g. ``Opta::Feed::XML::Soccer::F24`` -> ``F24``
                    feedtype = production_tags[-1]
                break

    return feedtype




[docs]
def read_event_data_xml(
    filepath: Union[str, Path]
) -> Tuple[Dict[str, Dict[str, Events]], Pitch]:
    """Parse Opta's f24 feed (containing match events) and extract event data and pitch
    information.

    This function provides a high-level access to the particular f24 feed and will
    return event objects for both teams. The number of segments is inferred from the
    data, yet data for each segment is stored in a separate object.

    Parameters
    ----------
    filepath: str or pathlib.Path
        Full path to the XML feed.

    Returns
    -------
    data_objects: Tuple[Dict[str, Dict[str, Events]], Pitch]
        Tuple of (nested) floodlight core objects with shape (events_objects,
        pitch).

        ``events_objects`` is a nested dictionary containing ``Events`` objects for
        each team and segment of the form ``events_objects[segment][team] = Events``.
        For a typical league match with two halves and teams this dictionary looks like:
        ``{'HT1': {'Home': Events, 'Away': Events}, 'HT2': {'Home': Events, 'Away':
        Events}}``.

        ``pitch`` is a ``Pitch`` object corresponding to the data.

    Raises
    ------
    ValueError
        If the file's production header does not identify it as an F24 feed.

    Notes
    -----
    Opta's format of handling event data information involves an elaborate use of so
    called qualifiers, which attach additional information to certain events. There
    also exists a number of mappings that define which qualifiers may be attached to
    which kind of events. Parsing this information involves quite a bit of logic and is
    planned to be included in further releases. As of now, qualifier information is
    parsed as a string in the `qualifier` column of the returned DataFrame and can be
    transformed to a dict of the form `{qualifier_id: value}`.

    Events with a ``period_id`` outside of 1-5 and events with ``type_id`` 30 are
    not included in the returned objects.
    """
    # check feed type
    if get_opta_feedtype(filepath) != "F24":
        raise ValueError(f"Not an Opta F24 feed: {filepath}")

    # load xml tree into memory
    tree = etree.parse(str(filepath))
    root = tree.getroot()

    # 1. parse match info
    matchinfo = root.xpath("Game")[0].attrib
    teams = ["Home", "Away"]
    tID_link = {
        int(matchinfo["home_team_id"]): "Home",
        int(matchinfo["away_team_id"]): "Away",
    }
    # the number of segments equals the number of ``period_<n>_start`` attributes
    period_start_pattern = re.compile(r"period_._start")
    number_of_periods = sum(1 for key in matchinfo if period_start_pattern.match(key))
    segments = [f"HT{period}" for period in range(1, number_of_periods + 1)]

    # 2. parse events
    # bins
    columns = [
        "eID",
        "gameclock",
        "pID",
        "outcome",
        "timestamp",
        "minute",
        "second",
        "at_x",
        "at_y",
        "qualifier",
    ]

    event_lists = {
        team: {segment: {col: [] for col in columns} for segment in segments}
        for team in teams
    }
    directions = {team: {} for team in teams}
    dir_link = {"Left to Right": "lr", "Right to Left": "rl"}
    # minute at which each period starts, used to make ``minute`` segment-relative.
    # Period 5 passes the period filter of the event loop but previously had no
    # entry here, so its events raised a KeyError; 120 continues the pattern of the
    # two 15-minute periods before it.
    # NOTE(review): confirm the period-5 offset against Opta's F24 specification.
    segment_offsets = {1: 0, 2: 45, 3: 90, 4: 105, 5: 120}
    kickoffs = {}

    # read kickoff events for times and playing direction
    # (NOTE: kickoff times can also be directly found in matchinfo, although the
    # explicit kickoff-event timestamps appear to be more accurate)
    for event in root.xpath("Game/Event[@type_id='32']"):
        # get team and segment information
        period = get_and_convert(event.attrib, "period_id", int)
        segment = "HT" + str(period)
        tID = get_and_convert(event.attrib, "team_id", int)
        team = tID_link[tID]
        # read kickoff times (if both teams carry a kickoff event for a segment,
        # the one appearing later in the feed wins)
        kickoff_timestring = get_and_convert(event.attrib, "timestamp", str)
        kickoffs[segment] = iso8601.parse_date(
            kickoff_timestring, default_timezone=pytz.utc
        )
        # read playing direction
        direction_qualifier = event.xpath("Q[@qualifier_id='127']")
        if direction_qualifier:
            value = get_and_convert(direction_qualifier[0], "value", str)
            direction = dir_link.get(value)
        else:
            direction = None
        directions[team][segment] = direction
        # kickoff events stay in the tree, so the loop below parses them again as
        # regular events

    # loop
    for event in root.xpath("Game/Event"):
        period = get_and_convert(event.attrib, "period_id", int)
        # skip match-unrelated events first, so that they cannot fail on the team
        # lookup below although they would be discarded anyway
        if period not in segment_offsets:
            continue

        # get team and segment information
        segment = "HT" + str(period)
        tID = get_and_convert(event.attrib, "team_id", int)
        team = tID_link[tID]

        # identifier and outcome:
        eID = get_and_convert(event.attrib, "type_id", int)
        # skip unwanted events
        if eID == 30:
            continue
        pID = get_and_convert(event.attrib, "player_id", int)
        outcome = get_and_convert(event.attrib, "outcome", int)
        event_bins = event_lists[team][segment]
        event_bins["eID"].append(eID)
        event_bins["pID"].append(pID)
        event_bins["outcome"].append(outcome)

        # absolute and relative time
        event_timestring = get_and_convert(event.attrib, "timestamp", str)
        minute = get_and_convert(event.attrib, "min", int)
        # transform minute to be relative to current segment
        minute -= segment_offsets[period]
        second = get_and_convert(event.attrib, "sec", int)
        timestamp = iso8601.parse_date(event_timestring, default_timezone=pytz.utc)
        delta = timestamp - kickoffs[segment]
        # re-adjust pre-kick-off events (e.g. substitutions) to 00:00
        gameclock = max(delta.total_seconds(), 0.0)
        event_bins["timestamp"].append(timestamp)
        event_bins["minute"].append(minute)
        event_bins["second"].append(second)
        event_bins["gameclock"].append(gameclock)

        # location
        event_bins["at_x"].append(get_and_convert(event.attrib, "x", float))
        event_bins["at_y"].append(get_and_convert(event.attrib, "y", float))

        # qualifier: collect all child elements as {qualifier_id: value} and store
        # the dict's string representation
        qual_dict = {}
        for qualifier in event.iterchildren():
            qual_id = int(qualifier.attrib["qualifier_id"])
            qual_dict[qual_id] = qualifier.attrib.get("value")
        event_bins["qualifier"].append(str(qual_dict))

    # create objects
    events_objects = {}
    for segment in segments:
        events_objects[segment] = {}
        for team in teams:
            events_objects[segment][team] = Events(
                events=pd.DataFrame(data=event_lists[team][segment]),
                direction=directions[team][segment],
            )
    pitch = Pitch.from_template("opta", sport="football")

    # pack objects
    data_objects = (events_objects, pitch)

    return data_objects