# Source code for cricinfo.cricinfo

from io import StringIO

import pandas as pd
import requests

from .helpers.constants import (
    BASE_URL,
    HEADERS,
    MATCH_FORMAT_PARAMETER,
    PAGE_PARAMETER,
    TEAM_PARAMETER,
    TEMPLATE_PARAMETER,
    TEMPLATE_VALUE,
    TYPE_PARAMETER,
)
from .helpers.data_sanitizer import clean_nan_column
from .match_format import MatchFormat
from .stat_type import StatType
from .team import Team



# [docs]
class Cricinfo:
    """Loads cricket statistics from ESPN Cricinfo into pandas DataFrames.

    Example::

        from cricinfo import Cricinfo, Team, MatchFormat, StatType

        df = Cricinfo.retrieve_stats(
            team=Team.Pakistan,
            match_format=MatchFormat.T20I,
            stat_type=StatType.BATTING,
        )
    """

    # Positions, within the list returned by ``pd.read_html``, of the tables
    # this class reads: the one holding the pagination text and the one
    # holding the statistics rows.
    _PAGINATION_TABLE_INDEX = 1
    _STATS_TABLE_INDEX = 2

    # Upper bound (seconds) on how long a single HTTP request may wait for
    # the server; without it ``requests`` can block indefinitely.
    _REQUEST_TIMEOUT_SECONDS = 30

    @staticmethod
    def retrieve_stats(
        team: Team | None,
        match_format: MatchFormat,
        stat_type: StatType,
    ) -> pd.DataFrame:
        """Retrieve statistics from ESPN Cricinfo.

        The first page is fetched to obtain both its statistics table and the
        total page count; every remaining page is then fetched in order and
        all statistics tables are concatenated into one DataFrame.

        :param team: Filter by team, or ``None`` for all teams.
        :param match_format: The match format to filter by.
        :param stat_type: The type of statistics to retrieve.
        :returns: A DataFrame containing the requested statistics.
        :raises requests.HTTPError: If the HTTP request fails.
        :raises requests.Timeout: If the server does not respond in time.
        :raises ValueError: If the response cannot be parsed.
        """
        params = Cricinfo._build_params(team, match_format, stat_type)
        stats_index = Cricinfo._STATS_TABLE_INDEX

        first_page = Cricinfo._fetch_page(params, page=1)
        frames = [clean_nan_column(first_page[stats_index])]

        num_pages = Cricinfo._parse_page_count(first_page)
        for page in range(2, num_pages + 1):
            tables = Cricinfo._fetch_page(params, page)
            frames.append(clean_nan_column(tables[stats_index]))

        return pd.concat(frames, ignore_index=True)

    @staticmethod
    def _build_params(
        team: Team | None,
        match_format: MatchFormat,
        stat_type: StatType,
    ) -> dict:
        """Build the query parameters shared by every page request.

        :param team: Team to filter by; the team parameter is omitted
            entirely when this is ``None``.
        :param match_format: Match format whose ``value`` is sent.
        :param stat_type: Statistic type whose ``value`` is sent.
        :returns: A new dict of query parameters (without a page number).
        """
        params = {
            MATCH_FORMAT_PARAMETER: match_format.value,
            TYPE_PARAMETER: stat_type.value,
            TEMPLATE_PARAMETER: TEMPLATE_VALUE,
        }
        if team is not None:
            params[TEAM_PARAMETER] = team.value
        return params

    @staticmethod
    def _fetch_page(params: dict, page: int) -> list[pd.DataFrame]:
        """Download one result page and parse its HTML tables.

        :param params: Query parameters from :meth:`_build_params`. The dict
            is not modified.
        :param page: Page number to request.
        :returns: Every table found in the response, in document order.
        :raises requests.HTTPError: If the server returns an error status.
        :raises requests.Timeout: If the server does not respond in time.
        :raises ValueError: If fewer tables than expected are present.
        """
        # Copy rather than write into ``params`` so the caller's dict does
        # not silently accumulate the page number between calls.
        page_params = {**params, PAGE_PARAMETER: str(page)}
        response = requests.get(
            BASE_URL,
            headers=HEADERS,
            params=page_params,
            timeout=Cricinfo._REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()

        tables = pd.read_html(StringIO(response.text))
        # The statistics table must exist, so at least index + 1 tables.
        required = Cricinfo._STATS_TABLE_INDEX + 1
        if len(tables) < required:
            raise ValueError(
                f"Expected at least {required} tables in response, "
                f"got {len(tables)}. "
                "The ESPN Cricinfo page structure may have changed."
            )
        return tables

    @staticmethod
    def _parse_page_count(tables: list[pd.DataFrame]) -> int:
        """Extract the total number of result pages from a parsed response.

        Reads the cell at column ``0``, row ``0`` of the pagination table and
        converts its last whitespace-separated token to an integer
        (presumably text of the form "Page 1 of N" -- confirm against a live
        response).

        :param tables: Tables returned by :meth:`_fetch_page`.
        :returns: The total page count.
        :raises ValueError: If the table or cell is missing, the cell is not
            text, or its last token is not an integer.
        """
        try:
            label = tables[Cricinfo._PAGINATION_TABLE_INDEX][0][0]
            return int(label.split()[-1])
        # AttributeError covers a non-string cell (e.g. NaN), which has no
        # ``split``; it is reported as the same documented ValueError.
        except (IndexError, ValueError, KeyError, AttributeError) as e:
            raise ValueError(
                "Could not determine page count from response. "
                "The ESPN Cricinfo page structure may have changed."
            ) from e