# -*- coding: utf-8 -*-
"""Utilities for registry alignment."""
from abc import ABC, abstractmethod
from typing import Any, Callable, ClassVar, Dict, Iterable, Mapping, Optional, Sequence
import click
from tabulate import tabulate
from ..constants import EXTERNAL
from ..resource_manager import Manager
from ..schema import Resource
from ..schema_utils import is_mismatch, read_metaregistry
from ..utils import norm
__all__ = [
"Aligner",
]
[docs]class Aligner(ABC):
"""A class for aligning new registries."""
#: The key for the external registry
key: ClassVar[str]
#: Header to put on the curation table, corresponding to ``get_curation_row()``
curation_header: ClassVar[Sequence[str]]
#: The function that gets the external registry as a dictionary from the string identifier to
#: the entries (could be anything, but a dictionary is probably best)
getter: ClassVar[Callable[..., Mapping[str, Any]]]
#: Keyword arguments to pass to the getter function on call
getter_kwargs: ClassVar[Optional[Mapping[str, Any]]] = None
#: Should new entries be included automatically? Only set this true for aligners of
#: very high confidence (e.g., OBO Foundry but not BioPortal)
include_new: ClassVar[bool] = False
#: Set this if there's another part of the data besides the ID that should be matched
alt_key_match: ClassVar[Optional[str]] = None
#: Set to true if you don't want to align to deprecated resources
skip_deprecated: ClassVar[bool] = False
subkey: ClassVar[str] = "prefix"
normalize_invmap: ClassVar[bool] = False
def __init__(self):
"""Instantiate the aligner."""
if self.key not in read_metaregistry():
raise TypeError(f"invalid metaprefix for aligner: {self.key}")
self.manager = Manager()
kwargs = self.getter_kwargs or {}
kwargs.setdefault("force_download", True)
self.external_registry = self.__class__.getter(**kwargs)
self.skip_external = self.get_skip()
# Get all of the pre-curated mappings from the Bioregistry
self.external_id_to_bioregistry_id = self.manager.get_registry_invmap(
self.key,
normalize=self.normalize_invmap,
)
# Run lexical alignment
self._align()
@property
def internal_registry(self) -> Dict[str, Resource]:
"""Get the internal registry."""
return self.manager.registry
[docs] def get_skip(self) -> Mapping[str, str]:
"""Get the mapping prefixes that should be skipped to their reasons (strings)."""
return {}
def _align(self):
"""Align the external registry."""
for external_id, external_entry in sorted(self.external_registry.items()):
if external_id in self.skip_external:
continue
bioregistry_id = self.external_id_to_bioregistry_id.get(external_id)
# There's already a mapping for this external ID to a bioregistry
# entry. Just add all of the latest metadata and move on
if bioregistry_id is not None:
self._align_action(bioregistry_id, external_id, external_entry)
continue
# try to lookup with lexical match
if not self.alt_key_match:
bioregistry_id = self.manager.normalize_prefix(external_id)
else:
alt_match = external_entry.get(self.alt_key_match)
if alt_match:
bioregistry_id = self.manager.normalize_prefix(alt_match)
# A lexical match was possible
if bioregistry_id is not None:
# check this external ID for curated mismatches, and move
# on if one has already been curated
if is_mismatch(bioregistry_id, self.key, external_id):
continue
if self.skip_deprecated and self.manager.is_deprecated(bioregistry_id):
continue
self._align_action(bioregistry_id, external_id, external_entry)
continue
# add the identifier from an external resource if it's been marked as high quality
elif self.include_new:
bioregistry_id = norm(external_id)
if is_mismatch(bioregistry_id, self.key, external_id):
continue
self.internal_registry[bioregistry_id] = Resource(prefix=bioregistry_id)
self._align_action(bioregistry_id, external_id, external_entry)
continue
def _align_action(
self, bioregistry_id: str, external_id: str, external_entry: Dict[str, Any]
) -> None:
if self.internal_registry[bioregistry_id].mappings is None:
self.internal_registry[bioregistry_id].mappings = {}
self.internal_registry[bioregistry_id].mappings[self.key] = external_id # type:ignore
_entry = self.prepare_external(external_id, external_entry)
_entry[self.subkey] = external_id
self.internal_registry[bioregistry_id][self.key] = _entry
self.external_id_to_bioregistry_id[external_id] = bioregistry_id
[docs] def prepare_external(self, external_id: str, external_entry: Dict[str, Any]) -> Dict[str, Any]:
"""Prepare a dictionary to be added to the bioregistry for each external registry entry.
The default implementation returns `external_entry` unchanged.
If you need more than that, override this method.
:param external_id: The external registry identifier
:param external_entry: The external registry data
:return: The dictionary to be added to the bioregistry for the aligned entry
"""
return external_entry
[docs] def write_registry(self) -> None:
"""Write the internal registry."""
self.manager.write_registry()
[docs] @classmethod
def align(cls, dry: bool = False, show: bool = False):
"""Align and output the curation sheet."""
instance = cls()
if not dry:
instance.write_registry()
if show:
instance.print_curation_table()
instance.write_curation_table()
[docs] @classmethod
def cli(cls):
"""Construct a CLI for the aligner."""
@click.command()
@click.option("--dry", is_flag=True)
@click.option("--show", is_flag=True)
def _main(dry: bool, show: bool):
cls.align(dry=dry, show=show)
_main()
[docs] @abstractmethod
def get_curation_row(self, external_id, external_entry) -> Sequence[str]:
"""Get a sequence of items that will be ech row in the curation table.
:param external_id: The external registry identifier
:param external_entry: The external registry data
:return: A sequence of cells to add to the curation table.
.. note:: You don't need to pass the external ID. this will automatically be the first element.
""" # noqa:DAR202
def _iter_curation_rows(self) -> Iterable[Sequence[str]]:
for external_id, external_entry in sorted(
self.external_registry.items(), key=lambda s: s[0].casefold()
):
if external_id in self.skip_external:
continue
bioregistry_id = self.external_id_to_bioregistry_id.get(external_id)
if bioregistry_id is None:
yield (
external_id,
*self.get_curation_row(external_id, external_entry),
)
[docs] def write_curation_table(self) -> None:
"""Write the curation table to a TSV."""
path = EXTERNAL.joinpath(self.key, "curation.tsv")
rows = list(self._iter_curation_rows())
if not rows:
if path.is_file():
path.unlink()
return
path.parent.mkdir(exist_ok=True, parents=True)
with path.open("w") as file:
print(self.subkey, *self.curation_header, sep="\t", file=file) # noqa:T201
for row in rows:
print(*row, sep="\t", file=file) # noqa:T201
[docs] def get_curation_table(self, **kwargs) -> Optional[str]:
"""Get the curation table as a string, built by :mod:`tabulate`."""
kwargs.setdefault("tablefmt", "rst")
headers = (self.subkey, *self.curation_header)
rows = list(self._iter_curation_rows())
if not rows:
return None
return tabulate(
rows,
headers=headers,
**kwargs,
)
[docs] def print_curation_table(self, **kwargs) -> None:
"""Print the curation table."""
s = self.get_curation_table(**kwargs)
if s:
print(s) # noqa:T201