Source code for bioregistry.pandas

"""Utilities for processing tabular data in Pandas dataframes.

The following examples show how the entries in the widely used `Gene Ontology Annotations
<http://geneontology.org/docs/go-annotation-file-gaf-format-2.2/#>`_ database distributed
in the `GAF format <http://geneontology.org/docs/go-annotation-file-gaf-format-2.2/>`_ can
be loaded with :mod:`pandas` then normalized with the Bioregistry. It can be loaded in full
with the :func:`get_goa_example` function.
"""

from __future__ import annotations

import functools
import logging
import re
from collections.abc import Callable
from re import Pattern
from typing import TypeVar, cast

import pandas as pd
from tabulate import tabulate
from tqdm.auto import tqdm

import bioregistry
from bioregistry.constants import MaybeCURIE

__all__ = [
    "curies_to_identifiers",
    "curies_to_iris",
    "get_goa_example",
    "identifiers_to_curies",
    "identifiers_to_iris",
    "iris_to_curies",
    "normalize_curies",
    "normalize_prefixes",
    "validate_curies",
    "validate_identifiers",
    "validate_prefixes",
]

logger = logging.getLogger(__name__)

X = TypeVar("X")


class PrefixLocationError(ValueError):
    """Signal that the location of the prefix was specified ambiguously.

    Functions that accept both a ``prefix`` and a ``prefix_column`` argument raise
    this error when neither of them, or both of them, were given.
    """



[docs]
def get_goa_example() -> pd.DataFrame:
    """Download the human GOA file and load it as a dataframe.

    :returns:
        A dataframe read from the gzipped, tab-separated GAF file. Lines starting
        with ``!`` are skipped as comments and no header row is read from the file.
    """
    url = "http://geneontology.org/gene-associations/goa_human.gaf.gz"
    return pd.read_csv(url, sep="\t", comment="!", header=None)



def _norm_column(df: pd.DataFrame, column: int | str) -> str:
    """Resolve a column reference to a column label.

    :param df: A dataframe
    :param column: Either a column label given as a string, which is returned as is,
        or a position that is looked up in ``df.columns``
    :returns: The label of the referenced column
    """
    if isinstance(column, str):
        return column
    return df.columns[column]



[docs]
def normalize_prefixes(
    df: pd.DataFrame, column: int | str, *, target_column: str | None = None
) -> None:
    """Standardize every prefix in a column using the Bioregistry.

    Missing values are skipped. The dataframe is modified in place.

    :param df: A dataframe
    :param column: The column (label or position) that contains the prefixes
    :param target_column: The column that receives the normalized prefixes. When
        omitted, the source ``column`` is overwritten.

    .. code-block:: python

        import bioregistry.pandas as brpd
        import pandas as pd

        df = brpd.get_goa_example()

        # column 1: DB
        #  i.e., `UniProtKB` becomes `uniprot`
        brpd.normalize_prefixes(df, column=0)
    """
    source = _norm_column(df, column)
    destination = source if target_column is None else target_column
    normalized = df[source].map(bioregistry.normalize_prefix, na_action="ignore")
    df[destination] = normalized




[docs]
def normalize_curies(
    df: pd.DataFrame, column: int | str, *, target_column: str | None = None
) -> None:
    """Standardize every CURIE in a column using the Bioregistry.

    Missing values are skipped. The dataframe is modified in place.

    :param df: A dataframe
    :param column: The column (label or position) that contains the CURIEs
    :param target_column:
        The column that receives the normalized CURIEs. When omitted, the source
        ``column`` is overwritten.

    .. code-block:: python

        import bioregistry.pandas as brpd
        import pandas as pd

        df = brpd.get_goa_example()

        # column 5: GO ID - fix normalization of capitalization of prefix,
        #  i.e., `GO:0003993` becomes `go:0003993`
        brpd.normalize_curies(df, column=4)

        # column 6: DB:Reference (|DB:Reference) - fix synonym of prefix
        #  i.e., `PMID:2676709` becomes `pubmed:2676709`
        brpd.normalize_curies(df, column=5)

        # column 8: With (or) From
        #  i.e., `GO:0000346` becomes `go:0000346`
        brpd.normalize_curies(df, column=7)

        # column 13: Taxon(|taxon) - fix synonym of prefix
        #  i.e., `taxon:9606` becomes `ncbitaxon:9606`
        brpd.normalize_curies(df, column=12)
    """
    source = _norm_column(df, column)
    destination = source if target_column is None else target_column
    normalized = df[source].map(bioregistry.normalize_curie, na_action="ignore")
    df[destination] = normalized




[docs]
def validate_prefixes(
    df: pd.DataFrame, column: int | str, *, target_column: str | None = None
) -> pd.Series[str]:
    """Check which prefixes in a column are already in their normalized form.

    A prefix counts as valid when normalizing it with the Bioregistry gives back
    exactly the same string. Missing values are skipped.

    :param df: A DataFrame
    :param column: The column (label or position) of prefixes to check
    :param target_column:
        If given, the validation results are also written to this column
    :returns:
        A pandas series holding the validity of each row

    .. code-block:: python

        import bioregistry.pandas as brpd
        import pandas as pd

        df = brpd.get_goa_example()

        # column 1: DB
        #  i.e., `UniProtKB` entries are not standard, and are therefore false
        idx = brpd.validate_prefixes(df, column=0)

        # Slice the dataframe based on valid and invalid prefixes
        valid_prefix_df = df[idx]
        invalid_prefix_df = df[~idx]
    """
    source = _norm_column(df, column)

    def _is_standard(raw_prefix: str) -> bool:
        return bioregistry.normalize_prefix(raw_prefix) == raw_prefix

    validity = df[source].map(_is_standard, na_action="ignore")
    if not target_column:
        return validity
    df[target_column] = validity
    return validity



def summarize_prefix_validation(
    df: pd.DataFrame, idx: pd.Series[bool], *, column: int | str = 0
) -> None:
    """Print a summary of prefix validation.

    :param df: The dataframe that was validated
    :param idx: A boolean series aligned with ``df`` that is true for valid rows,
        e.g., the result of :func:`validate_prefixes`
    :param column: The label of the column in ``df`` that holds the prefixes. It is
        used as is for the lookup and defaults to ``0``, the label of the first
        column of a dataframe that was loaded without a header.
    """
    # TODO add suggestions on what to do next, e.g.:,
    #  1. can some be normalized? use normalization function
    #  2. slice out invalid content
    #  3. make new prefix request to Bioregistry
    invalid = ~idx
    count = int(invalid.sum())
    total = len(df.index)
    # Guard against an empty dataframe, which would otherwise divide by zero
    ratio = count / total if total else 0.0
    unique = sorted(df[invalid][column].unique())

    print(  # noqa:T201
        f"{count:,} of {total:,} ({ratio:.0%})",
        "rows with the following prefixes need to be fixed:",
        unique,
    )
    # Keep only the invalid prefixes that the Bioregistry is able to standardize
    normalizable = {
        prefix: norm_prefix
        for prefix in unique
        if (norm_prefix := bioregistry.normalize_prefix(prefix))
    }
    if normalizable:
        print(  # noqa:T201
            f"The following prefixes could be normalized using normalize_prefixes():"
            f"\n\n{tabulate(normalizable.items(), headers=['raw', 'standardized'], tablefmt='github')}"
        )



[docs]
def validate_curies(
    df: pd.DataFrame, column: int | str, *, target_column: str | None = None
) -> pd.Series[str]:
    """Check the validity of every CURIE in a column.

    Each non-missing value is checked with :func:`bioregistry.is_valid_curie`.

    :param df: A DataFrame
    :param column: The column (label or position) of CURIEs to check
    :param target_column:
        If given, the validation results are also written to this column.
    :returns:
        A pandas series holding the validity of each row

    .. code-block:: python

        import bioregistry.pandas as brpd
        import pandas as pd

        df = brpd.get_goa_example()

        # column 5: GO ID - fix normalization of capitalization of prefix,
        #  i.e., `GO:0003993` is not standard and is therefore false
        idx = brpd.validate_curies(df, column=4)

        # Slice the dataframe
        valid_go_df = df[idx]
        invalid_go_df = df[~idx]
    """
    source = _norm_column(df, column)
    validity = df[source].map(bioregistry.is_valid_curie, na_action="ignore")
    if target_column:
        df[target_column] = validity
    typed_validity = cast("pd.Series[str]", validity)
    return typed_validity



def summarize_curie_validation(
    df: pd.DataFrame, idx: pd.Series[bool], *, column: int | str = 0
) -> None:
    """Print a summary of CURIE validation.

    :param df: The dataframe that was validated
    :param idx: A boolean series aligned with ``df`` that is true for valid rows,
        e.g., the result of :func:`validate_curies`
    :param column: The label of the column in ``df`` that holds the CURIEs. It is
        used as is for the lookup and defaults to ``0``, the label of the first
        column of a dataframe that was loaded without a header.
    """
    invalid = ~idx
    count = int(invalid.sum())
    total = len(df.index)
    # Guard against an empty dataframe, which would otherwise divide by zero
    ratio = count / total if total else 0.0
    unique = sorted(df[invalid][column].unique())
    print(  # noqa:T201
        f"{count:,} of {total:,} ({ratio:.0%})",
        "rows with the following CURIEs need to be fixed:",
        unique,
    )



[docs]
def validate_identifiers(
    df: pd.DataFrame,
    column: int | str,
    *,
    prefix: str | None = None,
    prefix_column: None | int | str = None,
    target_column: str | None = None,
    use_tqdm: bool = False,
) -> pd.Series[bool]:
    """Validate local unique identifiers in a given column.

    Some data sources split the prefix and identifier in separate columns,
    so you can use the ``prefix_column`` argument instead of the ``prefix``
    argument like in the following example with the GO Annotation Database:

    :param df: A dataframe
    :param column: A column in the dataframe containing identifiers
    :param prefix:
        Specify the prefix if all identifiers in the given column are from
        the same namespace
    :param prefix_column:
        Specify the ``prefix_column`` if there is an additional column whose rows
        contain the prefix for each rows' respective identifiers.
    :param target_column:
        If given, stores the results of validation in this column
    :param use_tqdm:
        Should a progress bar be shown? Only applies when the prefix column
        contains more than one distinct value.
    :returns:
        A pandas boolean series corresponding to the validity of each row
    :raises PrefixLocationError:
        If not exactly one of the prefix and prefix_column arguments are given
    :raises ValueError:
        If prefix_column is given and it contains no valid prefixes

    .. code-block:: python

        import bioregistry.pandas as brpd
        import pandas as pd

        df = brpd.get_goa_example()

        # Use a combination of column 1 (DB) and column 2 (DB Object ID) for validation
        idx = brpd.validate_identifiers(df, column=1, prefix_column=0)

        # Split the dataframe based on valid and invalid identifiers
        valid_df = df[idx]
        invalid_df = df[~idx]
    """
    column = _norm_column(df, column)
    if prefix_column is None and prefix is None:
        raise PrefixLocationError
    if prefix_column is not None and prefix is not None:
        raise PrefixLocationError

    if prefix is not None:
        results = _help_validate_identifiers(df, column, prefix)
    else:
        # Exactly one of the two arguments is given, so prefix_column is set here
        prefix_column = _norm_column(df, cast("int | str", prefix_column))
        unique_prefixes = df[prefix_column].unique()
        known_prefixes = [p for p in unique_prefixes if pd.notna(p)]
        if not known_prefixes:
            raise ValueError(f"No prefixes found in column {prefix_column}")

        if len(unique_prefixes) == 1:
            # Every row shares the same (non-missing) prefix, so the whole
            # column can be checked against a single pattern
            results = _help_validate_identifiers(df, column, known_prefixes[0])
        else:
            patterns: dict[str, Pattern[str] | None] = {}
            for known_prefix in known_prefixes:
                pattern = bioregistry.get_pattern(known_prefix)
                patterns[known_prefix] = re.compile(pattern) if pattern else None

            def _validate_row(_p: str, _i: str) -> bool | None:
                # Rows whose prefix has no pattern can't be judged either way
                _pattern = patterns.get(_p)
                if _pattern is None:
                    return None
                return bool(_pattern.fullmatch(_i))

            # pandas has its own internal notion of none's,
            # so even though this should be a pd.Series[bool | None],
            # we squash it down
            results = cast(
                "pd.Series[bool]",
                _multi_column_map(
                    df,
                    [prefix_column, column],
                    _validate_row,
                    use_tqdm=use_tqdm,
                ),
            )

    # Store the results regardless of which branch produced them
    if target_column:
        df[target_column] = results
    return results



def _help_validate_identifiers(df: pd.DataFrame, column: str, prefix: str) -> pd.Series[bool]:
    """Validate all identifiers in a column against the pattern of a single prefix.

    :param df: A dataframe
    :param column: The label of the column containing the identifiers
    :param prefix: The prefix that all identifiers in the column belong to
    :returns: A series that says for each non-missing identifier whether it fully
        matches the pattern for the prefix
    :raises ValueError: If the prefix can't be normalized or has no pattern
    """
    norm_prefix = bioregistry.normalize_prefix(prefix)
    if norm_prefix is None:
        raise ValueError(
            f"Can't validate identifiers for {prefix} because it is not in the Bioregistry"
        )
    pattern = bioregistry.get_pattern(prefix)
    if pattern is None:
        raise ValueError(
            f"Can't validate identifiers for {prefix} because it has no pattern in the Bioregistry"
        )
    pattern_re = re.compile(pattern)
    # The cast target is given as a string, like the other casts in this module,
    # so that ``pd.Series`` is never subscripted at runtime
    return cast(
        "pd.Series[bool]",
        df[column].map(
            lambda s: bool(pattern_re.fullmatch(s)),
            na_action="ignore",
        ),
    )



[docs]
def identifiers_to_curies(
    df: pd.DataFrame,
    column: int | str,
    *,
    prefix: str | None = None,
    prefix_column: None | int | str = None,
    target_column: str | None = None,
    use_tqdm: bool = False,
    normalize_prefixes_: bool = True,
) -> None:
    """Convert a column of local unique identifiers to CURIEs.

    The dataframe is modified in place.

    :param df: A dataframe
    :param column: A column in the dataframe containing identifiers
    :param prefix:
        Specify the prefix if all identifiers in the given column are from
        the same namespace
    :param prefix_column:
        Specify the ``prefix_column`` if there is an additional column whose rows
        contain the prefix for each rows' respective identifiers.
    :param target_column:
        If given, stores CURIEs in this column. Otherwise, overwrites the
        given ``column`` in place.
    :param use_tqdm:
        Should a progress bar be shown?
    :param normalize_prefixes_:
        Should the prefix column get auto-normalized if ``prefix_column`` is not None?
        Note that this overwrites the prefix column in place.
    :raises PrefixLocationError:
        If not exactly one of the prefix and prefix_column arguments are given
    :raises ValueError:
        If the given prefix is not normalizable

    .. code-block:: python

        import bioregistry.pandas as brpd
        import pandas as pd

        df = brpd.get_goa_example()

        # Use a combination of column 1 (DB) and column 2 (DB Object ID) for conversion
        brpd.identifiers_to_curies(df, column=1, prefix_column=0)
    """
    # FIXME do pattern check first so you don't get bananas
    column = _norm_column(df, column)
    if prefix_column is None and prefix is None:
        raise PrefixLocationError
    if prefix_column is not None and prefix is not None:
        raise PrefixLocationError

    target_column = target_column or column

    if prefix is not None:
        norm_prefix = bioregistry.normalize_prefix(prefix)
        if norm_prefix is None:
            raise ValueError(f"The prefix {prefix!r} could not be normalized")

        # The identifier is passed positionally after the prefix, the same order
        # that is used for the (prefix, identifier) rows in the other branch.
        # The result is assigned as a column (not via ``.loc``, which addresses rows).
        df[target_column] = df[column].map(
            lambda identifier: bioregistry.curie_to_str(norm_prefix, identifier),
            na_action="ignore",
        )
    elif prefix_column is not None:
        prefix_column = _norm_column(df, prefix_column)
        if normalize_prefixes_:
            normalize_prefixes(df=df, column=prefix_column)
        df[target_column] = _multi_column_map(
            df, [prefix_column, column], bioregistry.curie_to_str, use_tqdm=use_tqdm
        )




[docs]
def identifiers_to_iris(
    df: pd.DataFrame,
    column: int | str,
    *,
    prefix: str | None = None,
    prefix_column: None | int | str = None,
    target_column: str | None = None,
    use_tqdm: bool = False,
) -> None:
    """Convert a column of local unique identifiers to IRIs.

    The dataframe is modified in place.

    :param df: A dataframe
    :param column: A column in the dataframe containing identifiers
    :param prefix:
        Specify the prefix if all identifiers in the given column are from
        the same namespace
    :param prefix_column:
        Specify the ``prefix_column`` if there is an additional column whose rows
        contain the prefix for each rows' respective identifiers.
    :param target_column:
        If given, stores IRIs in this column. Otherwise, overwrites the
        given ``column`` in place.
    :param use_tqdm:
        Should a progress bar be shown?

    :raises PrefixLocationError:
        If not exactly one of the prefix and prefix_column arguments are given
    :raises ValueError:
        If the given prefix is not normalizable

    .. code-block:: python

        import bioregistry.pandas as brpd
        import pandas as pd

        df = brpd.get_goa_example()

        # Use a combination of column 1 (DB) and column 2 (DB Object ID) for conversion
        brpd.identifiers_to_iris(df, column=1, prefix_column=0)
    """
    column = _norm_column(df, column)
    if prefix_column is None and prefix is None:
        raise PrefixLocationError
    if prefix_column is not None and prefix is not None:
        raise PrefixLocationError

    destination = target_column or column
    if prefix is not None:
        norm_prefix = bioregistry.normalize_prefix(prefix)
        if norm_prefix is None:
            raise ValueError(f"The prefix {prefix!r} could not be normalized")
        # The identifier is passed positionally after the prefix, the same order
        # that is used for the (prefix, identifier) rows in the other branch
        df[destination] = df[column].map(
            lambda identifier: bioregistry.get_iri(norm_prefix, identifier),
            na_action="ignore",
        )
    else:
        # Exactly one of the two arguments is given, so prefix_column is set here
        prefix_column = _norm_column(df, cast("int | str", prefix_column))
        df[destination] = _multi_column_map(
            df, [prefix_column, column], bioregistry.get_iri, use_tqdm=use_tqdm
        )



def _multi_column_map(
    df: pd.DataFrame,
    columns: list[str],
    func: Callable[..., X],
    *,
    use_tqdm: bool = False,
) -> pd.Series[X]:  # type:ignore[type-var]
    """Apply a function row-wise over several columns of a dataframe.

    :param df: A dataframe
    :param columns: The labels of the columns whose cells are passed, in this
        order, as positional arguments to ``func``
    :param func: The function to call on each row
    :param use_tqdm: Should a progress bar be shown?
    :returns: A series sharing the index of ``df``. Rows with a missing value in
        any of the selected columns get ``None`` instead of a function result.
    """

    def _apply(cells):  # type:ignore[no-untyped-def]
        # Skip the call as soon as one of the cells is missing
        if any(not pd.notna(cell) for cell in cells):
            return None
        return func(*cells)

    progress = tqdm(df[columns].values, unit_scale=True, disable=not use_tqdm)
    mapped = [_apply(cells) for cells in progress]
    return pd.Series(mapped, index=df.index)



[docs]
def curies_to_iris(
    df: pd.DataFrame, column: int | str, *, target_column: str | None = None
) -> None:
    """Replace the CURIEs in a column with their IRIs.

    Missing values are skipped. The dataframe is modified in place.

    :param df: A dataframe
    :param column: The column (label or position) in the dataframe containing CURIEs
    :param target_column:
        The column that receives the IRIs. When omitted, the source ``column``
        is overwritten.

    .. seealso:: :func:`iris_to_curies`
    """
    source = _norm_column(df, column)
    iris = df[source].map(bioregistry.get_iri, na_action="ignore")
    destination = target_column or source
    df[destination] = iris




[docs]
def curies_to_identifiers(
    df: pd.DataFrame,
    column: int | str,
    *,
    target_column: str | None = None,
    prefix_column_name: str | None = None,
) -> None:
    """Split a CURIE column into a prefix and local identifier column.

    By default, the local identifier stays in the same column unless target_column is given.
    If prefix_column_name isn't given, it's derived from the target column by
    appending ``_prefix`` to its name. The dataframe is modified in place.

    :param df: A dataframe
    :param column: A column in the dataframe containing CURIEs
    :param target_column:
        If given, stores identifiers in this column. Else, stores in the given column
    :param prefix_column_name:
        If given, stores prefixes in this column. Else, derives the column name from the
        target column name.
    :raises ValueError:
        If no prefix_column_name is given and the auto-generated name conflicts with a column
        already in the dataframe.

    .. code-block:: python

        import bioregistry.pandas as brpd
        import pandas as pd

        df = brpd.get_goa_example()

        # column 5: GO ID - split the CURIEs into a prefix column
        #  and a local unique identifier column
        brpd.curies_to_identifiers(df, column=4)
    """
    column = _norm_column(df, column)
    if target_column is None:
        target_column = column
    if prefix_column_name is None:
        prefix_column_name = f"{target_column}_prefix"
        if prefix_column_name in df.columns:
            raise ValueError(
                "auto-generated prefix column is already present. please specify explicitly."
            )

    parsed: list[MaybeCURIE] = [
        bioregistry.parse_curie(curie) if pd.notna(curie) else (None, None) for curie in df[column]
    ]
    # Build both columns explicitly rather than unpacking ``zip(*parsed)``,
    # which fails when the dataframe has no rows
    prefixes = [parsed_prefix for parsed_prefix, _ in parsed]
    identifiers = [parsed_identifier for _, parsed_identifier in parsed]
    df[prefix_column_name] = prefixes
    df[target_column] = identifiers




[docs]
def iris_to_curies(
    df: pd.DataFrame, column: int | str, *, target_column: str | None = None
) -> None:
    """Replace the IRIs in a column with their CURIEs.

    Missing values are skipped. The dataframe is modified in place.

    :param df: A dataframe
    :param column: The column (label or position) in the dataframe containing IRIs
    :param target_column:
        The column that receives the CURIEs. When omitted, the source ``column``
        is overwritten.

    .. seealso:: :func:`curies_to_iris`
    """
    source = _norm_column(df, column)
    curies = df[source].map(bioregistry.curie_from_iri, na_action="ignore")
    destination = target_column or source
    df[destination] = curies