# Source code for bioregistry.external.n2t

"""Download registry information from N2T."""

from __future__ import annotations

import json
from collections.abc import Mapping, Sequence
from pathlib import Path
from typing import Any, ClassVar

import yaml
from pystow.utils import download

from bioregistry.constants import RAW_DIRECTORY, URI_FORMAT_KEY
from bioregistry.external.alignment_utils import Aligner, load_processed

# Names exported as this module's public API.
__all__ = [
    "N2TAligner",
    "get_n2t",
]

# Remote location of the N2T prefix listing (YAML) fetched by get_n2t().
URL = "https://n2t.net/e/n2t_full_prefixes.yaml"
# Directory containing this module; the processed JSON cache is stored here.
DIRECTORY = Path(__file__).parent.resolve()
# Local path the raw N2T download is written to before it is parsed.
RAW_PATH = RAW_DIRECTORY / "n2t.yml"
# JSON cache of the processed records; get_n2t() reuses it unless a
# re-download is forced.
PROCESSED_PATH = DIRECTORY / "processed.json"
# N2T prefixes that are excluded, mapped to the reason for excluding each.
# Used both to filter records in get_n2t() and as N2TAligner's skip list.
SKIP = {
    "zzztestprefix": "test prefix should not be considered",
    "urn": "too meta",
    "url": "too meta",
    "purl": "too meta",
    "lsid": "too meta",
    "hdl": "paid service, too meta",
    "repec": "irrelevant prefix from economics",
    "merops": "issue with miriam having duplicate prefixes for this resource",  # FIXME
    "hgnc.family": "issue with miriam having duplicate prefixes for this resource",  # FIXME
}
# URI formats (compared after ``$id`` has been rewritten to ``$1``) that are
# discarded, so the affected record ends up without a URI format.
SKIP_URI_FORMATS = {
    "http://arabidopsis.org/servlets/TairObject?accession=$1",
}



# [docs]
def get_n2t(force_download: bool = False) -> dict[str, dict[str, Any]]:
    """Get the N2T registry.

    :param force_download: If true, ignore an existing processed cache and
        download and re-process the raw N2T prefix file.
    :returns: A dictionary from N2T prefix to its processed record. Only
        records whose ``type`` is ``"scheme"``, whose prefix contains no
        ``/``, and whose prefix is not listed in ``SKIP`` are included.
    """
    if PROCESSED_PATH.exists() and not force_download:
        return load_processed(PROCESSED_PATH)

    download(url=URL, path=RAW_PATH, force=True)
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which is not UTF-8 everywhere and would otherwise make
    # parsing of non-ASCII text platform-dependent.
    with RAW_PATH.open(encoding="utf-8") as file:
        data = yaml.safe_load(file)

    rv = {
        key: _process(record)
        for key, record in data.items()
        if record["type"] == "scheme" and "/" not in key and key not in SKIP
    }

    # Cache the processed records so later calls can skip the download.
    with PROCESSED_PATH.open("w", encoding="utf-8") as file:
        json.dump(rv, file, sort_keys=True, indent=2)
    return rv



def _process(record: dict[str, Any]) -> dict[str, Any]:
    """Convert a raw N2T record into its processed form.

    Fields whose value is ``None`` are left out of the result.
    """
    # NOTE(review): this compares against the *string* "true"; a YAML boolean
    # ``true`` would not match - confirm the upstream file quotes this value.
    embedded = record.get("prefixed") == "true"
    extracted = (
        ("name", record.get("name")),
        (URI_FORMAT_KEY, _get_uri_format(record)),
        ("description", record.get("description")),
        ("homepage", record.get("more")),
        ("pattern", record.get("pattern")),
        ("example", record.get("test")),
        ("namespaceEmbeddedInLui", embedded),
    )
    processed: dict[str, Any] = {}
    for field, value in dict(extracted).items():
        if value is not None:
            processed[field] = value
    return processed


def _get_uri_format(record: dict[str, Any]) -> str | None:
    raw_redirect: str | None = record.get("redirect")
    if raw_redirect is None:
        return None
    uri_format = raw_redirect.replace("$id", "$1")
    if uri_format in SKIP_URI_FORMATS:
        return None
    return uri_format


class N2TAligner(Aligner):
    """Aligner for the N2T."""

    # Short identifier for this external registry.
    # NOTE(review): how ``Aligner`` consumes ``key`` is not visible here - confirm.
    key = "n2t"
    # Function that returns the processed N2T records (prefix -> record).
    getter = get_n2t
    # Record fields listed for curation; these are keys produced by _process().
    # Presumably used as column headers by ``Aligner`` - confirm against the base class.
    curation_header: ClassVar[Sequence[str]] = ("name", "homepage", "description")

    def get_skip(self) -> Mapping[str, str]:
        """Get the prefixes in N2T that should be skipped.

        :returns: The module-level ``SKIP`` mapping from prefix to the reason
            it is skipped.
        """
        return SKIP


# When run as a script, invoke the aligner's command-line entry point.
# ``cli`` is not defined on N2TAligner itself, so it is inherited from Aligner.
if __name__ == "__main__":
    N2TAligner.cli()