# Source code for bioregistry.pandas

"""Utilities for processing tabular data in Pandas dataframes.

The following examples show how the entries in the widely used `Gene Ontology Annotations
<http://geneontology.org/docs/go-annotation-file-gaf-format-2.2/#>`_ database distributed
in the `GAF format <http://geneontology.org/docs/go-annotation-file-gaf-format-2.2/>`_ can
be loaded with :mod:`pandas` then normalized with the Bioregistry. It can be loaded in full
with the :func:`get_goa_example` function.
"""

import functools
import logging
import re
from typing import Dict, Optional, Pattern, Union

import pandas as pd
from tabulate import tabulate
from tqdm.auto import tqdm

import bioregistry

# Public API of this module, grouped by the kind of operation performed.
# NOTE(review): ``summarize_prefix_validation`` and ``summarize_curie_validation``
# are defined in this module but are not listed here - confirm whether that
# omission is intentional.
__all__ = [
    "get_goa_example",
    # Normalization
    "normalize_prefixes",
    "normalize_curies",
    # Validation
    "validate_prefixes",
    "validate_curies",
    "validate_identifiers",
    # Conversion
    "identifiers_to_curies",
    "identifiers_to_iris",
    "curies_to_iris",
    "curies_to_identifiers",
    "iris_to_curies",
]

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


class PrefixLocationError(ValueError):
    """Signal that the prefix was specified ambiguously or not at all.

    Functions accepting both ``prefix`` and ``prefix_column`` raise this when
    neither or both of the two arguments are given.
    """


def get_goa_example() -> pd.DataFrame:
    """Download the human GOA file and parse it into a dataframe.

    :returns: A dataframe read from the remote, gzipped GAF file. Since the
        file is read without a header row, the columns get integer labels.
    """
    url = "http://geneontology.org/gene-associations/goa_human.gaf.gz"
    # Read as tab-separated text with no header row; text following "!" is
    # treated as a comment and skipped
    return pd.read_csv(url, sep="\t", comment="!", header=None)


def _norm_column(df: pd.DataFrame, column: Union[int, str]) -> str:
    return column if isinstance(column, str) else df.columns[column]


def normalize_prefixes(
    df: pd.DataFrame, column: Union[int, str], *, target_column: Optional[str] = None
) -> None:
    """Standardize the prefixes stored in one column of a dataframe.

    Every non-missing value is passed through :func:`bioregistry.normalize_prefix`;
    missing values are skipped. The dataframe is modified in place.

    :param df: A dataframe
    :param column: The label (or integer position) of the column containing prefixes
    :param target_column: The column in which to store the normalized prefixes.
        If not given, the source ``column`` is overwritten.

    .. code-block:: python

        import bioregistry.pandas as brpd
        import pandas as pd

        df = brpd.get_goa_example()

        # column 1: DB
        #  i.e., `UniProtKB` becomes `uniprot`
        brpd.normalize_prefixes(df, column=0)
    """
    source = _norm_column(df, column)
    destination = source if target_column is None else target_column
    normalized = df[source].map(bioregistry.normalize_prefix, na_action="ignore")
    df[destination] = normalized


def normalize_curies(
    df: pd.DataFrame, column: Union[int, str], *, target_column: Optional[str] = None
) -> None:
    """Standardize the CURIEs stored in one column of a dataframe.

    Every non-missing value is passed through :func:`bioregistry.normalize_curie`;
    missing values are skipped. The dataframe is modified in place.

    :param df: A dataframe
    :param column: The label (or integer position) of the column containing CURIEs
    :param target_column:
        The column in which to store the normalized CURIEs. If not given, the
        source ``column`` is overwritten.

    .. code-block:: python

        import bioregistry.pandas as brpd
        import pandas as pd

        df = brpd.get_goa_example()

        # column 5: GO ID - fix normalization of capitalization of prefix,
        #  i.e., `GO:0003993` becomes `go:0003993`
        brpd.normalize_curies(df, column=4)

        # column 6: DB:Reference (|DB:Reference) - fix synonym of prefix
        #  i.e., `PMID:2676709` becomes `pubmed:2676709`
        brpd.normalize_curies(df, column=5)

        # column 8: With (or) From
        #  i.e., `GO:0000346` becomes `go:0000346`
        brpd.normalize_curies(df, column=7)

        # column 13: Taxon(|taxon) - fix synonym of prefix
        #  i.e., `taxon:9606` becomes `ncbitaxon:9606`
        brpd.normalize_curies(df, column=12)
    """
    source = _norm_column(df, column)
    destination = source if target_column is None else target_column
    normalized = df[source].map(bioregistry.normalize_curie, na_action="ignore")
    df[destination] = normalized


def validate_prefixes(
    df: pd.DataFrame, column: Union[int, str], *, target_column: Optional[str] = None
) -> pd.Series:
    """Check which prefixes in a column are already in standard form.

    A prefix counts as valid when :func:`bioregistry.normalize_prefix` returns
    it unchanged. Missing values are skipped.

    :param df: A dataframe
    :param column: The label (or integer position) of the column containing prefixes
    :param target_column:
        If given, the validation results are also stored in this column
    :returns:
        A pandas series holding the validation result for each row

    .. code-block:: python

        import bioregistry.pandas as brpd
        import pandas as pd

        df = brpd.get_goa_example()

        # column 1: DB
        #  i.e., `UniProtKB` entries are not standard, and are therefore false
        idx = brpd.validate_prefixes(df, column=0)

        # Slice the dataframe based on valid and invalid prefixes
        valid_prefix_df = df[idx]
        invalid_prefix_df = df[~idx]
    """
    source = _norm_column(df, column)

    def _is_standard(value):
        return bioregistry.normalize_prefix(value) == value

    validity = df[source].map(_is_standard, na_action="ignore")
    if target_column:
        df[target_column] = validity
    return validity


def summarize_prefix_validation(
    df: pd.DataFrame, idx: pd.Series, column: Union[int, str] = 0
) -> None:
    """Print a summary of prefix validation.

    Reports how many rows failed validation, lists the distinct offending
    prefixes, and shows a table of those that :func:`bioregistry.normalize_prefix`
    is able to standardize.

    :param df: The dataframe whose prefixes were validated
    :param idx: A boolean series (e.g., the result of :func:`validate_prefixes`)
        that is true for the rows holding a valid prefix
    :param column: The label of the column containing the prefixes. It is used
        directly for dataframe indexing (i.e., as a label, not as a position)
        and defaults to ``0``.
    """
    # TODO add suggestions on what to do next, e.g.:,
    #  1. can some be normalized? use normalization function
    #  2. slice out invalid content
    #  3. make new prefix request to Bioregistry
    invalid = ~idx
    count = invalid.sum()
    total = len(df.index)
    unique = sorted(df[invalid][column].unique())

    print(  # noqa:T201
        f"{count:,} of {total:,} ({count / total:.0%})",
        "rows with the following prefixes need to be fixed:",
        unique,
    )

    # Keep only the invalid prefixes that the Bioregistry can map to a standard prefix
    normalizable = {}
    for prefix in unique:
        norm_prefix = bioregistry.normalize_prefix(prefix)
        if norm_prefix:
            normalizable[prefix] = norm_prefix

    if normalizable:
        table = tabulate(
            normalizable.items(), headers=["raw", "standardized"], tablefmt="github"
        )
        # The values being summarized are prefixes, so point at the prefix normalizer
        print(  # noqa:T201
            "The following prefixes could be normalized using normalize_prefixes():"
            f"\n\n{table}"
        )


def validate_curies(
    df: pd.DataFrame, column: Union[int, str], *, target_column: Optional[str] = None
) -> pd.Series:
    """Check which CURIEs in a column are valid.

    Every non-missing value is checked with :func:`bioregistry.is_valid_curie`;
    missing values are skipped.

    :param df: A dataframe
    :param column: The label (or integer position) of the column containing CURIEs
    :param target_column:
        If given, the validation results are also stored in this column
    :returns:
        A pandas series holding the validation result for each row

    .. code-block:: python

        import bioregistry.pandas as brpd
        import pandas as pd

        df = brpd.get_goa_example()

        # column 5: GO ID - fix normalization of capitalization of prefix,
        #  i.e., `GO:0003993` is not standard and is therefore false
        idx = brpd.validate_curies(df, column=4)

        # Slice the dataframe
        valid_go_df = df[idx]
        invalid_go_df = df[~idx]
    """
    source = _norm_column(df, column)
    validity = df[source].map(bioregistry.is_valid_curie, na_action="ignore")
    if target_column:
        df[target_column] = validity
    return validity


def summarize_curie_validation(
    df: pd.DataFrame, idx: pd.Series, column: Union[int, str] = 0
) -> None:
    """Print a summary of CURIE validation.

    Reports how many rows failed validation and lists the distinct offending
    CURIEs in sorted order.

    :param df: The dataframe whose CURIEs were validated
    :param idx: A boolean series (e.g., the result of :func:`validate_curies`)
        that is true for the rows holding a valid CURIE
    :param column: The label of the column containing the CURIEs. It is used
        directly for dataframe indexing (i.e., as a label, not as a position)
        and defaults to ``0``.
    """
    invalid = ~idx
    count = invalid.sum()
    total = len(df.index)
    unique = sorted(df[invalid][column].unique())
    print(  # noqa:T201
        f"{count:,} of {total:,} ({count / total:.0%})",
        "rows with the following CURIEs need to be fixed:",
        unique,
    )


def validate_identifiers(
    df: pd.DataFrame,
    column: Union[int, str],
    *,
    prefix: Optional[str] = None,
    prefix_column: Union[None, int, str] = None,
    target_column: Optional[str] = None,
    use_tqdm: bool = False,
) -> pd.Series:
    """Validate local unique identifiers in a given column.

    Identifiers are checked against the regular expression pattern that the
    Bioregistry has for their prefix. Some data sources split the prefix and
    identifier in separate columns, so you can use the ``prefix_column``
    argument instead of the ``prefix`` argument, like in the example below
    with the GO Annotation Database.

    :param df: A dataframe
    :param column: The label (or integer position) of the column containing identifiers
    :param prefix:
        Specify the prefix if all identifiers in the given column are from
        the same namespace
    :param prefix_column:
        Specify the label (or integer position) of an additional column whose
        rows contain the prefix for each row's respective identifier.
    :param target_column:
        If given, stores the results of validation in this column
    :param use_tqdm:
        Should a progress bar be shown? Only used when several different
        prefixes appear in ``prefix_column``.
    :returns:
        A pandas series corresponding to the validity of each row. When
        validating with a prefix column, rows whose prefix or identifier is
        missing, or whose prefix has no pattern, get ``None``.
    :raises PrefixLocationError:
        If not exactly one of the prefix and prefix_column arguments are given
    :raises ValueError:
        If prefix_column is given and it contains no prefixes, or if there is
        a single prefix that is either unknown or has no pattern

    .. code-block:: python

        import bioregistry.pandas as brpd
        import pandas as pd

        df = brpd.get_goa_example()

        # Use a combination of column 1 (DB) and column 2 (DB Object ID) for validation
        idx = brpd.validate_identifiers(df, column=1, prefix_column=0)

        # Split the dataframe based on valid and invalid identifiers
        valid_df = df[idx]
        invalid_df = df[~idx]
    """
    column = _norm_column(df, column)
    # Exactly one of ``prefix`` and ``prefix_column`` must be given
    if (prefix is None) == (prefix_column is None):
        raise PrefixLocationError

    if prefix is not None:
        results = _help_validate_identifiers(df, column, prefix)
    else:
        # Resolve integer positions the same way as for ``column``
        prefix_column = _norm_column(df, prefix_column)
        prefixes = df[prefix_column].unique()
        known_prefixes = [p for p in prefixes if not pd.isna(p)]
        if not known_prefixes:
            raise ValueError(f"No prefixes found in column {prefix_column}")
        if 1 == len(prefixes):
            # A single prefix without any missing values: validate the whole column at once
            results = _help_validate_identifiers(df, column, known_prefixes[0])
        else:
            # Compile each prefix's pattern once; prefixes without a pattern map to None
            patterns: Dict[str, Optional[Pattern]] = {}
            for known_prefix in known_prefixes:
                pattern = bioregistry.get_pattern(known_prefix)
                patterns[known_prefix] = re.compile(pattern) if pattern else None

            def _validate_row(_p: Optional[str], _i: str) -> Optional[bool]:
                if _p is None:
                    return None
                _pattern = patterns.get(_p)
                if _pattern is None:
                    return None
                return bool(_pattern.fullmatch(_i))

            results = _multi_column_map(
                df,
                [prefix_column, column],
                _validate_row,
                use_tqdm=use_tqdm,
            )

    # Store the results regardless of which branch produced them
    if target_column:
        df[target_column] = results
    return results


def _help_validate_identifiers(df, column, prefix):
    """Validate a column of identifiers against the pattern of a single prefix.

    :param df: A dataframe
    :param column: The label of the column containing identifiers
    :param prefix: The prefix whose pattern all identifiers are matched against
    :returns: A series with the full-match result for each non-missing identifier
    :raises ValueError: If the prefix can't be normalized or has no pattern
    """
    if bioregistry.normalize_prefix(prefix) is None:
        raise ValueError(
            f"Can't validate identifiers for {prefix} because it is not in the Bioregistry"
        )

    pattern = bioregistry.get_pattern(prefix)
    if pattern is None:
        raise ValueError(
            f"Can't validate identifiers for {prefix} because it has no pattern in the Bioregistry"
        )

    compiled = re.compile(pattern)

    def _matches(identifier):
        return bool(compiled.fullmatch(identifier))

    return df[column].map(_matches, na_action="ignore")


def identifiers_to_curies(
    df: pd.DataFrame,
    column: Union[int, str],
    *,
    prefix: Optional[str] = None,
    prefix_column: Union[None, int, str] = None,
    target_column: Optional[str] = None,
    use_tqdm: bool = False,
    normalize_prefixes_: bool = True,
) -> None:
    """Convert a column of local unique identifiers to CURIEs.

    The dataframe is modified in place.

    :param df: A dataframe
    :param column: The label (or integer position) of the column containing identifiers
    :param prefix:
        Specify the prefix if all identifiers in the given column are from
        the same namespace
    :param prefix_column:
        Specify the label (or integer position) of an additional column whose
        rows contain the prefix for each row's respective identifier.
    :param target_column:
        If given, stores CURIEs in this column. Otherwise, overwrites the
        identifier column.
    :param use_tqdm:
        Should a progress bar be shown? Only used with ``prefix_column``.
    :param normalize_prefixes_:
        Should the prefix column get auto-normalized (in place) if
        ``prefix_column`` is not None?
    :raises PrefixLocationError:
        If not exactly one of the prefix and prefix_column arguments are given
    :raises ValueError:
        If the given prefix is not normalizable

    .. code-block:: python

        import bioregistry.pandas as brpd
        import pandas as pd

        df = brpd.get_goa_example()

        # Use a combination of column 1 (DB) and column 2 (DB Object ID) for conversion
        brpd.identifiers_to_curies(df, column=1, prefix_column=0)
    """
    # TODO validate identifiers against their patterns before conversion
    #  (e.g., with validate_identifiers) so malformed identifiers don't get converted
    column = _norm_column(df, column)
    # Exactly one of ``prefix`` and ``prefix_column`` must be given
    if (prefix is None) == (prefix_column is None):
        raise PrefixLocationError

    target_column = target_column or column

    if prefix is not None:
        norm_prefix = bioregistry.normalize_prefix(prefix)
        if norm_prefix is None:
            raise ValueError(
                f"Can't convert identifiers to CURIEs because {prefix} is not in the Bioregistry"
            )

        # Assign to the column (not to a row label via ``.loc``). The prefix is passed
        # positionally since ``Series.map`` supplies the identifier as a single
        # positional argument, which would otherwise collide with a keyword-bound prefix.
        df[target_column] = df[column].map(
            lambda identifier: bioregistry.curie_to_str(norm_prefix, identifier),
            na_action="ignore",
        )
    else:
        prefix_column = _norm_column(df, prefix_column)
        if normalize_prefixes_:
            normalize_prefixes(df=df, column=prefix_column)
        df[target_column] = _multi_column_map(
            df, [prefix_column, column], bioregistry.curie_to_str, use_tqdm=use_tqdm
        )


def identifiers_to_iris(
    df: pd.DataFrame,
    column: Union[int, str],
    *,
    prefix: Optional[str] = None,
    prefix_column: Union[None, int, str] = None,
    target_column: Optional[str] = None,
    use_tqdm: bool = False,
) -> None:
    """Convert a column of local unique identifiers to IRIs.

    The dataframe is modified in place.

    :param df: A dataframe
    :param column: The label (or integer position) of the column containing identifiers
    :param prefix:
        Specify the prefix if all identifiers in the given column are from
        the same namespace
    :param prefix_column:
        Specify the label (or integer position) of an additional column whose
        rows contain the prefix for each row's respective identifier.
    :param target_column:
        If given, stores IRIs in this column. Otherwise, overwrites the
        identifier column.
    :param use_tqdm:
        Should a progress bar be shown? Only used with ``prefix_column``.

    :raises PrefixLocationError:
        If not exactly one of the prefix and prefix_column arguments are given
    :raises ValueError:
        If the given prefix is not normalizable

    .. code-block:: python

        import bioregistry.pandas as brpd
        import pandas as pd

        df = brpd.get_goa_example()

        # Use a combination of column 1 (DB) and column 2 (DB Object ID) for conversion
        brpd.identifiers_to_iris(df, column=1, prefix_column=0)
    """
    column = _norm_column(df, column)
    # Exactly one of ``prefix`` and ``prefix_column`` must be given
    if (prefix is None) == (prefix_column is None):
        raise PrefixLocationError

    destination = target_column or column

    if prefix is not None:
        norm_prefix = bioregistry.normalize_prefix(prefix)
        if norm_prefix is None:
            raise ValueError(
                f"Can't convert identifiers to IRIs because {prefix} is not in the Bioregistry"
            )
        # The prefix is passed positionally since ``Series.map`` supplies the identifier
        # as a single positional argument, which would otherwise collide with a
        # keyword-bound prefix.
        df[destination] = df[column].map(
            lambda identifier: bioregistry.get_iri(norm_prefix, identifier),
            na_action="ignore",
        )
    else:
        prefix_column = _norm_column(df, prefix_column)
        df[destination] = _multi_column_map(
            df, [prefix_column, column], bioregistry.get_iri, use_tqdm=use_tqdm
        )


def _multi_column_map(df, columns, func, *, use_tqdm: bool = False):
    rows = df[columns].values
    if use_tqdm:
        rows = tqdm(rows, unit_scale=True)
    return pd.Series(
        [func(*row) if all(pd.notna(cell) for cell in row) else None for row in rows],
        index=df.index,
    )


def curies_to_iris(
    df: pd.DataFrame, column: Union[int, str], *, target_column: Optional[str] = None
) -> None:
    """Replace the CURIEs in a column with IRIs.

    Every non-missing value is passed through :func:`bioregistry.get_iri`;
    missing values are skipped. The dataframe is modified in place.

    :param df: A dataframe
    :param column: The label (or integer position) of the column containing CURIEs
    :param target_column:
        If given, stores the IRIs in this column. Otherwise, overwrites the
        given column in place.

    .. seealso:: :func:`iris_to_curies`
    """
    source = _norm_column(df, column)
    iris = df[source].map(bioregistry.get_iri, na_action="ignore")
    destination = target_column or source
    df[destination] = iris


def curies_to_identifiers(
    df: pd.DataFrame,
    column: Union[int, str],
    *,
    target_column: Optional[str] = None,
    prefix_column_name: Optional[str] = None,
) -> None:
    """Split a CURIE column into a prefix and local identifier column.

    By default, the local identifier stays in the same column unless ``target_column``
    is given. If ``prefix_column_name`` isn't given, it's derived from the target column
    by appending ``_prefix`` to its label. The dataframe is modified in place.

    :param df: A dataframe
    :param column: The label (or integer position) of the column containing CURIEs
    :param target_column:
        If given, stores identifiers in this column. Else, stores in the given column
    :param prefix_column_name:
        If given, stores prefixes in this column. Else, derives the column name from the
        target column name.
    :raises ValueError:
        If no prefix_column_name is given and the auto-generated name conflicts with a column
        already in the dataframe.

    .. code-block:: python

        import bioregistry.pandas as brpd
        import pandas as pd

        df = brpd.get_goa_example()

        # column 5: GO ID - split each CURIE into its prefix and local identifier,
        #  i.e., the identifier `0003993` of `GO:0003993` stays in column 5 and
        #  the prefix goes to a new column named `4_prefix`
        brpd.curies_to_identifiers(df, column=4)
    """
    column = _norm_column(df, column)
    if target_column is None:
        target_column = column
    if prefix_column_name is None:
        prefix_column_name = f"{target_column}_prefix"
        if prefix_column_name in df.columns:
            raise ValueError(
                "auto-generated prefix column is already present. please specify explicitly."
            )

    # Parse row by row so missing values become a (None, None) pair rather than
    # breaking the unpacking; this also keeps an empty dataframe working
    pairs = [
        (None, None) if pd.isna(curie) else bioregistry.parse_curie(curie)
        for curie in df[column]
    ]
    # Collect both lists before assigning, since the target column may be the source column
    prefixes = [prefix for prefix, _ in pairs]
    identifiers = [identifier for _, identifier in pairs]
    df[prefix_column_name] = prefixes
    df[target_column] = identifiers


def iris_to_curies(
    df: pd.DataFrame, column: Union[int, str], *, target_column: Optional[str] = None
) -> None:
    """Replace the IRIs in a column with CURIEs.

    Every non-missing value is passed through :func:`bioregistry.curie_from_iri`;
    missing values are skipped. The dataframe is modified in place.

    :param df: A dataframe
    :param column: The label (or integer position) of the column containing IRIs
    :param target_column:
        If given, stores the CURIEs in this column. Otherwise, overwrites the
        given column in place.

    .. seealso:: :func:`curies_to_iris`
    """
    source = _norm_column(df, column)
    curies = df[source].map(bioregistry.curie_from_iri, na_action="ignore")
    destination = target_column or source
    df[destination] = curies