"""Utilities for processing tabular data in Pandas dataframes.
The following examples show how the entries in the widely used `Gene Ontology Annotations
<http://geneontology.org/docs/go-annotation-file-gaf-format-2.2/#>`_ database distributed
in the `GAF format <http://geneontology.org/docs/go-annotation-file-gaf-format-2.2/>`_ can
be loaded with :mod:`pandas` then normalized with the Bioregistry. It can be loaded in full
with the :func:`get_goa_example` function.
"""
import functools
import logging
import re
from typing import Dict, Optional, Pattern, Union
import pandas as pd
from tabulate import tabulate
from tqdm.auto import tqdm
import bioregistry
__all__ = [
"get_goa_example",
# Normalization
"normalize_prefixes",
"normalize_curies",
# Validation
"validate_prefixes",
"validate_curies",
"validate_identifiers",
# Conversion
"identifiers_to_curies",
"identifiers_to_iris",
"curies_to_iris",
"curies_to_identifiers",
"iris_to_curies",
]
logger = logging.getLogger(__name__)
class PrefixLocationError(ValueError):
"""Raised when not exactly one of prefix and prefix_column were given."""
[docs]def get_goa_example() -> pd.DataFrame:
"""Get the GOA file."""
return pd.read_csv(
"http://geneontology.org/gene-associations/goa_human.gaf.gz",
sep="\t",
comment="!",
header=None,
)
def _norm_column(df: pd.DataFrame, column: Union[int, str]) -> str:
return column if isinstance(column, str) else df.columns[column]
[docs]def normalize_prefixes(
df: pd.DataFrame, column: Union[int, str], *, target_column: Optional[str] = None
) -> None:
"""Normalize prefixes in a given column.
:param df: A dataframe
:param column: A column in the dataframe containing prefixes
:param target_column: The target column to put the normalized prefixes. If not given,
overwrites the given ``column`` in place
.. code-block:: python
import bioregistry.pandas as brpd
import pandas as pd
df = brpd.get_goa_example()
# column 1: DB
# i.e., `UniProtKB` becomes `uniprot`
brpd.normalize_prefixes(df, column=0)
"""
column = _norm_column(df, column)
if target_column is None:
target_column = column
df[target_column] = df[column].map(bioregistry.normalize_prefix, na_action="ignore")
[docs]def normalize_curies(
df: pd.DataFrame, column: Union[int, str], *, target_column: Optional[str] = None
) -> None:
"""Normalize CURIEs in a given column.
:param df: A dataframe
:param column: The column of CURIEs to normalize
:param target_column:
The column to put the normalized CURIEs in. If not given, overwrites the given ``column`` in place.
.. code-block:: python
import bioregistry.pandas as brpd
import pandas as pd
df = brpd.get_goa_example()
# column 5: GO ID - fix normalization of capitalization of prefix,
# i.e., `GO:0003993` becomes `go:0003993`
brpd.normalize_curies(df, column=4)
# column 6: DB:Reference (|DB:Reference) - fix synonym of prefix
# i.e., `PMID:2676709` becomes `pubmed:2676709`
brpd.normalize_curies(df, column=5)
# column 8: With (or) From
# i.e., `GO:0000346` becomes `go:0000346`
brpd.normalize_curies(df, column=7)
# column 13: Taxon(|taxon) - fix synonym of prefix
# i.e., `taxon:9606` becomes `ncbitaxon:9606`
brpd.normalize_curies(df, column=12)
"""
column = _norm_column(df, column)
if target_column is None:
target_column = column
df[target_column] = df[column].map(bioregistry.normalize_curie, na_action="ignore")
[docs]def validate_prefixes(
df: pd.DataFrame, column: Union[int, str], *, target_column: Optional[str] = None
) -> pd.Series:
"""Validate prefixes in a given column.
:param df: A DataFrame
:param column: The column of prefixes to validate
:param target_column:
The optional column to put the results of validation
:returns:
A pandas series corresponding to the validity of each row
.. code-block:: python
import bioregistry.pandas as brpd
import pandas as pd
df = brpd.get_goa_example()
# column 1: DB
# i.e., `UniProtKB` entries are not standard, and are therefore false
idx = brpd.validate_prefixes(df, column=0)
# Slice the dataframe based on valid and invalid prefixes
valid_prefix_df = df[idx]
invalid_prefix_df = df[~idx]
"""
column = _norm_column(df, column)
results = df[column].map(lambda x: bioregistry.normalize_prefix(x) == x, na_action="ignore")
if target_column:
df[target_column] = results
return results
def summarize_prefix_validation(df: pd.DataFrame, idx: pd.Series) -> None:
"""Provide a summary of prefix validation."""
# TODO add suggestions on what to do next, e.g.:,
# 1. can some be normalized? use normalization function
# 2. slice out invalid content
# 3. make new prefix request to Bioregistry
count = (~idx).sum()
unique = sorted(df[~idx][0].unique())
print( # noqa:T201
f"{count:,} of {len(df.index):,} ({count / len(df.index):.0%})",
"rows with the following prefixes need to be fixed:",
unique,
)
normalizable = {
prefix: norm_prefix
for prefix, norm_prefix in (
(prefix, bioregistry.normalize_prefix(prefix)) for prefix in unique
)
if norm_prefix
}
if normalizable:
print( # noqa:T201
f"The following prefixes could be normalized using normalize_curies():"
f"\n\n{tabulate(normalizable.items(), headers=['raw', 'standardized'], tablefmt='github')}"
)
[docs]def validate_curies(
df: pd.DataFrame, column: Union[int, str], *, target_column: Optional[str] = None
) -> pd.Series:
"""Validate CURIEs in a given column.
:param df: A DataFrame
:param column: The column of CURIEs to validate
:param target_column:
The optional column to put the results of validation.
:returns:
A pandas series corresponding to the validity of each row
.. code-block:: python
import bioregistry.pandas as brpd
import pandas as pd
df = brpd.get_goa_example()
# column 5: GO ID - fix normalization of capitalization of prefix,
# i.e., `GO:0003993` is not standard and is therefore false
idx = brpd.validate_curies(df, column=4)
# Slice the dataframe
valid_go_df = df[idx]
invalid_go_df = df[~idx]
"""
column = _norm_column(df, column)
results = df[column].map(bioregistry.is_valid_curie, na_action="ignore")
if target_column:
df[target_column] = results
return results
def summarize_curie_validation(df, idx) -> None:
"""Provide a summary of CURIE validation."""
count = (~idx).sum()
unique = sorted(df[~idx][0].unique())
print( # noqa:T201
f"{count:,} of {len(df.index):,} ({count / len(df.index):.0%})",
"rows with the following CURIEs need to be fixed:",
unique,
)
[docs]def validate_identifiers(
df: pd.DataFrame,
column: Union[int, str],
*,
prefix: Optional[str] = None,
prefix_column: Optional[str] = None,
target_column: Optional[str] = None,
use_tqdm: bool = False,
) -> pd.Series:
"""Validate local unique identifiers in a given column.
Some data sources split the prefix and identifier in separate columns,
so you can use the ``prefix_column`` argument instead of the ``prefix``
argument like in the following example with the GO Annotation Database:
:param df: A dataframe
:param column: A column in the dataframe containing identifiers
:param prefix:
Specify the prefix if all identifiers in the given column are from
the same namespace
:param prefix_column:
Specify the ``prefix_column`` if there is an additional column whose rows
contain the prefix for each rows' respective identifiers.
:param target_column:
If given, stores the results of validation in this column
:param use_tqdm:
Should a progress bar be shown?
:returns:
A pandas series corresponding to the validity of each row
:raises PrefixLocationError:
If not exactly one of the prefix and prefix_column arguments are given
:raises ValueError:
If prefix_column is given and it contains no valid prefixes
.. code-block:: python
import bioregistry.pandas as brpd
import pandas as pd
df = brpd.get_goa_example()
# Use a combination of column 1 (DB) and column 2 (DB Object ID) for validation
idx = brpd.validate_identifiers(df, column=1, prefix_column=0)
# Split the dataframe based on valid and invalid identifiers
valid_df = df[idx]
invalid_df = df[~idx]
"""
column = _norm_column(df, column)
if prefix_column is None and prefix is None:
raise PrefixLocationError
elif prefix_column is not None and prefix is not None:
raise PrefixLocationError
elif prefix is not None:
return _help_validate_identifiers(df, column, prefix)
else: # prefix_column is not None
prefixes = df[prefix_column].unique()
if 0 == len(prefixes):
raise ValueError(f"No prefixes found in column {prefix_column}")
if 1 == len(prefixes):
return _help_validate_identifiers(df, column, list(prefixes)[0])
patterns: Dict[str, Optional[Pattern]] = {}
for prefix in df[prefix_column].unique():
if pd.isna(prefix):
continue
pattern = bioregistry.get_pattern(prefix)
patterns[prefix] = re.compile(pattern) if pattern else None
def _validate_lambda(_p: Optional[str], _i: str) -> Optional[bool]:
if _p is None:
return None
_pattern = patterns.get(_p)
if _pattern is None:
return None
return bool(_pattern.fullmatch(_i))
results = _multi_column_map(
df,
[prefix_column, column],
_validate_lambda,
use_tqdm=use_tqdm,
)
if target_column:
df[target_column] = results
return results
def _help_validate_identifiers(df, column, prefix):
norm_prefix = bioregistry.normalize_prefix(prefix)
if norm_prefix is None:
raise ValueError(
f"Can't validate identifiers for {prefix} because it is not in the Bioregistry"
)
pattern = bioregistry.get_pattern(prefix)
if pattern is None:
raise ValueError(
f"Can't validate identifiers for {prefix} because it has no pattern in the Bioregistry"
)
pattern_re = re.compile(pattern)
return df[column].map(
lambda s: bool(pattern_re.fullmatch(s)),
na_action="ignore",
)
[docs]def identifiers_to_curies(
df: pd.DataFrame,
column: Union[int, str],
*,
prefix: Optional[str] = None,
prefix_column: Union[None, int, str] = None,
target_column: Optional[str] = None,
use_tqdm: bool = False,
normalize_prefixes_: bool = True,
) -> None:
"""Convert a column of local unique identifiers to CURIEs.
:param df: A dataframe
:param column: A column in the dataframe containing identifiers
:param prefix:
Specify the prefix if all identifiers in the given column are from
the same namespace
:param prefix_column:
Specify the ``prefix_column`` if there is an additional column whose rows
contain the prefix for each rows' respective identifiers.
:param target_column:
If given, stores CURIEs in this column,
:param use_tqdm:
Should a progress bar be shown?
:param normalize_prefixes_:
Should the prefix column get auto-normalized if ``prefix_column`` is not None?
:raises PrefixLocationError:
If not exactly one of the prefix and prefix_column arguments are given
:raises ValueError:
If the given prefix is not normalizable
.. code-block:: python
import bioregistry.pandas as brpd
import pandas as pd
df = brpd.get_goa_example()
# Use a combination of column 1 (DB) and column 2 (DB Object ID) for conversion
brpd.identifiers_to_curies(df, column=1, prefix_column=0)
"""
# FIXME do pattern check first so you don't get bananas
column = _norm_column(df, column)
if prefix_column is None and prefix is None:
raise PrefixLocationError
elif prefix_column is not None and prefix is not None:
raise PrefixLocationError
# valid_idx = validate_identifiers(df, column=column, prefix=prefix, prefix_column=prefix_column)
target_column = target_column or column
if prefix is not None:
norm_prefix = bioregistry.normalize_prefix(prefix)
if norm_prefix is None:
raise ValueError
df.loc[target_column] = df[column].map(
functools.partial(bioregistry.curie_to_str, prefix=norm_prefix),
na_action="ignore",
)
elif prefix_column is not None:
prefix_column = _norm_column(df, prefix_column)
if normalize_prefixes_:
normalize_prefixes(df=df, column=prefix_column)
df[target_column] = _multi_column_map(
df, [prefix_column, column], bioregistry.curie_to_str, use_tqdm=use_tqdm
)
[docs]def identifiers_to_iris(
df: pd.DataFrame,
column: Union[int, str],
*,
prefix: str,
prefix_column: Optional[str] = None,
target_column: Optional[str] = None,
use_tqdm: bool = False,
) -> None:
"""Convert a column of local unique identifiers to IRIs.
:param df: A dataframe
:param column: A column in the dataframe containing identifiers
:param prefix:
Specify the prefix if all identifiers in the given column are from
the same namespace
:param prefix_column:
Specify the ``prefix_column`` if there is an additional column whose rows
contain the prefix for each rows' respective identifiers.
:param target_column:
If given, stores IRIs in this column
:param use_tqdm:
Should a progress bar be shown?
:raises PrefixLocationError:
If not exactly one of the prefix and prefix_column arguments are given
:raises ValueError:
If the given prefix is not normalizable
.. code-block:: python
import bioregistry.pandas as brpd
import pandas as pd
df = brpd.get_goa_example()
# Use a combination of column 1 (DB) and column 2 (DB Object ID) for conversion
brpd.identifiers_to_iris(df, column=1, prefix_column=0)
"""
column = _norm_column(df, column)
if prefix_column is None and prefix is None:
raise PrefixLocationError
elif prefix_column is not None and prefix is not None:
raise PrefixLocationError
elif prefix is not None:
norm_prefix = bioregistry.normalize_prefix(prefix)
if norm_prefix is None:
raise ValueError
df[target_column or column] = df[column].map(
functools.partial(bioregistry.get_iri, prefix=norm_prefix), na_action="ignore"
)
else: # prefix_column is not None
prefix_column = _norm_column(df, prefix_column)
df[target_column or column] = _multi_column_map(
df, [prefix_column, column], bioregistry.get_iri, use_tqdm=use_tqdm
)
def _multi_column_map(df, columns, func, *, use_tqdm: bool = False):
rows = df[columns].values
if use_tqdm:
rows = tqdm(rows, unit_scale=True)
return pd.Series(
[func(*row) if all(pd.notna(cell) for cell in row) else None for row in rows],
index=df.index,
)
[docs]def curies_to_iris(
df: pd.DataFrame, column: Union[int, str], *, target_column: Optional[str] = None
) -> None:
"""Convert a column of CURIEs to IRIs.
:param df: A dataframe
:param column: A column in the dataframe containing CURIEs
:param target_column:
If given, stores the IRIs in this column. Otherwise, overwrites the
given column in place.
.. seealso:: :func:`iris_to_curies`
"""
column = _norm_column(df, column)
df[target_column or column] = df[column].map(bioregistry.get_iri, na_action="ignore")
[docs]def curies_to_identifiers(
df: pd.DataFrame,
column: Union[int, str],
*,
target_column: Optional[str] = None,
prefix_column_name: Optional[str] = None,
) -> None:
"""Split a CURIE column into a prefix and local identifier column.
By default, the local identifier stays in the same column unless target_column is given.
If prefix_column_name isn't given, it's derived from the target column (if labels available)
or just appended to the end if not
:param df: A dataframe
:param column: A column in the dataframe containing CURIEs
:param target_column:
If given, stores identifiers in this column. Else, stores in the given column
:param prefix_column_name:
If given, stores prefixes in this column. Else, derives the column name from the
target column name.
:raises ValueError:
If no prefix_column_name is given and the auto-generated name conflicts with a column
already in the dataframe.
.. code-block:: python
import bioregistry.pandas as brpd
import pandas as pd
df = brpd.get_goa_example()
# column 5: GO ID - convert CURIEs directly to IRIs
# i.e., `GO:0003993` becomes `http://amigo.geneontology.org/amigo/term/GO:0003993`
brpd.curies_to_identifiers(df, column=4)
"""
column = _norm_column(df, column)
if target_column is None:
target_column = column
if prefix_column_name is None:
prefix_column_name = f"{target_column}_prefix"
if prefix_column_name in df.columns:
raise ValueError(
"auto-generated prefix column is already present. please specify explicitly."
)
prefixes, identifiers = zip(*df[column].map(bioregistry.parse_curie, na_action="ignore"))
df[prefix_column_name] = prefixes
df[target_column] = identifiers
[docs]def iris_to_curies(
df: pd.DataFrame, column: Union[int, str], *, target_column: Optional[str] = None
) -> None:
"""Convert a column of IRIs to CURIEs.
:param df: A dataframe
:param column: A column in the dataframe containing IRIs
:param target_column:
If given, stores the CURIEs in this column. Otherwise, overwrites the
given column in place.
.. seealso:: :func:`curies_to_iris`
"""
column = _norm_column(df, column)
df[target_column or column] = df[column].map(bioregistry.curie_from_iri, na_action="ignore")