# Source code for bioregistry.align.utils

# -*- coding: utf-8 -*-

"""Utilities for registry alignment."""

import csv
from typing import Any, Callable, ClassVar, Dict, Iterable, Mapping, Optional, Sequence

import click
from tabulate import tabulate

from ..constants import EXTERNAL
from ..resource_manager import Manager
from ..schema import Resource
from ..schema_utils import is_mismatch
from ..utils import norm

__all__ = [
    "Aligner",
]


class Aligner:
    """A class for aligning new registries.

    Subclasses must define :data:`key`, :data:`curation_header`, and
    :data:`getter`. Instantiating a subclass downloads the external registry
    through :data:`getter` and immediately aligns it against the internal
    registry held by a fresh :class:`Manager`.
    """

    #: The key for the external registry
    key: ClassVar[str]

    #: Header to put on the curation table, corresponding to ``get_curation_row()``
    curation_header: ClassVar[Sequence[str]]

    #: The function that gets the external registry as a dictionary from the string identifier to
    #: the entries (could be anything, but a dictionary is probably best)
    getter: ClassVar[Callable[..., Mapping[str, Any]]]

    #: Keyword arguments to pass to the getter function on call
    getter_kwargs: ClassVar[Optional[Mapping[str, Any]]] = None

    #: Should new entries be included automatically? Only set this true for aligners of
    #: very high confidence (e.g., OBO Foundry but not BioPortal)
    include_new: ClassVar[bool] = False

    #: Set this if there's another part of the data besides the ID that should be matched
    alt_key_match: ClassVar[Optional[str]] = None

    #: Set this to a key in the external entry whose value is an iterable of
    #: alternative strings; each is tried in order when no match was found otherwise
    alt_keys_match: ClassVar[Optional[str]] = None

    #: Set to true if you don't want to align to deprecated resources
    skip_deprecated: ClassVar[bool] = False

    #: The key under which the external identifier is stored in each aligned
    #: entry; also used as the first column name of the curation table
    subkey: ClassVar[str] = "prefix"

    #: Passed as ``normalize`` when getting the pre-curated inverse mapping
    #: from the manager
    normalize_invmap: ClassVar[bool] = False

    def __init__(self, force_download: Optional[bool] = None):
        """Instantiate the aligner.

        :param force_download: If given, overrides the ``force_download``
            keyword passed to :data:`getter`. If not given, the value in
            :data:`getter_kwargs` is used, falling back to ``True``.
        :raises TypeError: If the class does not define :data:`key` or
            :data:`curation_header`, or if :data:`key` is not a metaprefix
            known to the manager's metaregistry.
        """
        cls = self.__class__
        if not hasattr(cls, "key"):
            raise TypeError(f"{cls.__name__} must define the class variable `key`")
        if not hasattr(cls, "curation_header"):
            raise TypeError(f"{cls.__name__} must define the class variable `curation_header`")

        self.manager = Manager()

        if self.key not in self.manager.metaregistry:
            raise TypeError(f"invalid metaprefix for aligner: {self.key}")

        kwargs = dict(self.getter_kwargs or {})
        kwargs.setdefault("force_download", True)
        if force_download is not None:
            kwargs["force_download"] = force_download
        # Access the getter through the class so a plain function stored as a
        # class variable is not bound to ``self``.
        self.external_registry = cls.getter(**kwargs)
        self.skip_external = self.get_skip()

        # Get all of the pre-curated mappings from the Bioregistry
        self.external_id_to_bioregistry_id = self.manager.get_registry_invmap(
            self.key,
            normalize=self.normalize_invmap,
        )

        # Run lexical alignment
        self._align()

    @property
    def internal_registry(self) -> "Dict[str, Resource]":
        """Get the internal registry."""
        return self.manager.registry

    def get_skip(self) -> Mapping[str, str]:
        """Get the mapping prefixes that should be skipped to their reasons (strings)."""
        return {}

    def _align(self):
        """Align the external registry.

        Iterates over the external registry in sorted order. Each external
        entry that is not skipped is handled by the first rule that applies:

        1. A pre-curated mapping exists: refresh the stored metadata.
        2. A lexical match is found (on the identifier, or on the alternative
           key(s) if configured): align, unless it is a curated mismatch or
           the matched resource is deprecated and :data:`skip_deprecated` is set.
        3. :data:`include_new` is set: create a new internal resource and
           align it, unless it is a curated mismatch.
        """
        for external_id, external_entry in sorted(self.external_registry.items()):
            if external_id in self.skip_external:
                continue

            bioregistry_id = self.external_id_to_bioregistry_id.get(external_id)

            # There's already a mapping for this external ID to a bioregistry
            # entry. Just add all of the latest metadata and move on
            if bioregistry_id is not None:
                self._align_action(bioregistry_id, external_id, external_entry)
                continue

            # try to lookup with lexical match
            if not self.alt_key_match:
                bioregistry_id = self.manager.normalize_prefix(external_id)
            else:
                alt_match = external_entry.get(self.alt_key_match)
                if alt_match:
                    bioregistry_id = self.manager.normalize_prefix(alt_match)

            if bioregistry_id is None and self.alt_keys_match:
                # ``or []`` guards against the key being present with a null value
                for alt_match in external_entry.get(self.alt_keys_match) or []:
                    bioregistry_id = self.manager.normalize_prefix(alt_match)
                    if bioregistry_id:
                        break

            if bioregistry_id is not None:
                # A lexical match was possible. Check this external ID for
                # curated mismatches, and move on if one has already been curated
                if is_mismatch(bioregistry_id, self.key, external_id):
                    continue
                if self.skip_deprecated and self.manager.is_deprecated(bioregistry_id):
                    continue
                self._align_action(bioregistry_id, external_id, external_entry)
            elif self.include_new:
                # add the identifier from an external resource if it's been
                # marked as high quality
                bioregistry_id = norm(external_id)
                if is_mismatch(bioregistry_id, self.key, external_id):
                    continue
                self.internal_registry[bioregistry_id] = Resource(prefix=bioregistry_id)
                self._align_action(bioregistry_id, external_id, external_entry)

    def _align_action(
        self, bioregistry_id: str, external_id: str, external_entry: Dict[str, Any]
    ) -> None:
        """Record the alignment of an external entry to an internal resource.

        Stores the mapping on the resource, stores the prepared external
        metadata on the resource under :data:`key`, and updates the inverse
        mapping from external identifiers to internal ones.

        :param bioregistry_id: The internal prefix being aligned to
        :param external_id: The external registry identifier
        :param external_entry: The external registry data
        """
        resource = self.internal_registry[bioregistry_id]
        if resource.mappings is None:
            resource.mappings = {}
        resource.mappings[self.key] = external_id  # type:ignore

        # NOTE(review): the default ``prepare_external`` returns ``external_entry``
        # itself, so the assignment below also writes into the external
        # registry's own dictionary.
        _entry = self.prepare_external(external_id, external_entry)
        _entry[self.subkey] = external_id
        resource[self.key] = _entry
        self.external_id_to_bioregistry_id[external_id] = bioregistry_id

    def prepare_external(self, external_id: str, external_entry: Dict[str, Any]) -> Dict[str, Any]:
        """Prepare a dictionary to be added to the bioregistry for each external registry entry.

        The default implementation returns `external_entry` unchanged.
        If you need more than that, override this method.

        :param external_id: The external registry identifier
        :param external_entry: The external registry data
        :return: The dictionary to be added to the bioregistry for the aligned entry
        """
        return external_entry

    def write_registry(self) -> None:
        """Write the internal registry."""
        self.manager.write_registry()

    @classmethod
    def align(
        cls,
        dry: bool = False,
        show: bool = False,
        force_download: Optional[bool] = None,
    ) -> None:
        """Align and output the curation sheet.

        The curation table is written regardless of ``dry``; only the
        registry write is suppressed by it.

        :param dry: If true, don't write changes to the registry
        :param show: If true, print a curation table
        :param force_download: Force re-download of the data
        """
        instance = cls(force_download=force_download)
        if not dry:
            instance.write_registry()
        if show:
            instance.print_curation_table()
        instance.write_curation_table()

    @classmethod
    def cli(cls):
        """Construct a CLI for the aligner and run it immediately."""

        @click.command()
        @click.option("--dry", is_flag=True, help="if set, don't write changes to the registry")
        @click.option("--show", is_flag=True, help="if set, print a curation table")
        @click.option(
            "--no-force", is_flag=True, help="if set, do not force re-downloading the data"
        )
        def _main(dry: bool, show: bool, no_force: bool) -> None:
            cls.align(dry=dry, show=show, force_download=not no_force)

        _main()

    def get_curation_row(self, external_id, external_entry) -> Sequence[str]:
        """Get a sequence of items that will be each row in the curation table.

        :param external_id: The external registry identifier
        :param external_entry: The external registry data
        :return: A sequence of cells to add to the curation table.
        :raises TypeError: If an invalid value is encountered

        The default implementation of this function iterates over all of the keys
        in the class variable :data:`curation_header` and looks inside each record
        for those in order. Missing values become empty strings, strings are
        stripped, booleans become ``"true"``/``"false"``, and lists, tuples, and
        sets are stripped, sorted, and joined with ``|``.

        .. note::

            Do not include the external ID in the returned row; it is
            automatically prepended as the first element.
        """  # noqa:DAR202
        rv = []
        for k in self.curation_header:
            value = external_entry.get(k)
            if value is None:
                rv.append("")
            elif isinstance(value, str):
                rv.append(value.strip())
            elif isinstance(value, bool):
                rv.append("true" if value else "false")
            elif isinstance(value, (list, tuple, set)):
                rv.append("|".join(sorted(v.strip() for v in value)))
            else:
                raise TypeError(f"unexpected type in curation header: {value}")
        return rv

    def _iter_curation_rows(self) -> Iterable[Sequence[str]]:
        """Yield a curation row for each external entry that is neither skipped nor aligned.

        Rows are ordered case-insensitively by external identifier, with the
        exact identifier as a tie-breaker.
        """
        for external_id, external_entry in sorted(
            self.external_registry.items(), key=lambda s: (s[0].casefold(), s[0])
        ):
            if external_id in self.skip_external:
                continue
            if self.external_id_to_bioregistry_id.get(external_id) is not None:
                continue
            yield (
                external_id,
                *self.get_curation_row(external_id, external_entry),
            )

    def write_curation_table(self) -> None:
        """Write the curation table to a TSV.

        If there is nothing left to curate, any existing curation file is
        removed instead of writing an empty table.
        """
        path = EXTERNAL.joinpath(self.key, "curation.tsv")
        rows = list(self._iter_curation_rows())
        if not rows:
            if path.is_file():
                path.unlink()
            return

        path.parent.mkdir(exist_ok=True, parents=True)
        # The csv module handles line endings itself, so the file must be
        # opened with newline="" to avoid doubled line endings on some platforms.
        with path.open("w", newline="", encoding="utf-8") as file:
            writer = csv.writer(file, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
            writer.writerow((self.subkey, *self.curation_header))
            writer.writerows(rows)

    def get_curation_table(self, **kwargs) -> Optional[str]:
        """Get the curation table as a string, built by :mod:`tabulate`.

        :param kwargs: Keyword arguments passed through to :func:`tabulate.tabulate`.
            ``tablefmt`` defaults to ``"rst"``.
        :return: The formatted table, or None if there are no rows to curate
        """
        kwargs.setdefault("tablefmt", "rst")
        headers = (self.subkey, *self.curation_header)
        rows = list(self._iter_curation_rows())
        if not rows:
            return None
        return tabulate(
            rows,
            headers=headers,
            **kwargs,
        )

    def print_curation_table(self, **kwargs) -> None:
        """Print the curation table, if it has any rows.

        :param kwargs: Keyword arguments passed through to :meth:`get_curation_table`
        """
        s = self.get_curation_table(**kwargs)
        if s:
            print(s)  # noqa:T201