Source code for bioregistry.external.miriam

# -*- coding: utf-8 -*-

"""Download registry information from Identifiers.org/MIRIAMs."""

import json
from operator import itemgetter
from pathlib import Path

from pystow.utils import download

from bioregistry.constants import RAW_DIRECTORY, URI_FORMAT_KEY
from bioregistry.external.alignment_utils import Aligner

__all__ = [
    "get_miriam",
    "MiriamAligner",
]

DIRECTORY = Path(__file__).parent.resolve()
RAW_PATH = RAW_DIRECTORY / "miriam.json"
PROCESSED_PATH = DIRECTORY / "processed.json"
MIRIAM_URL = "https://registry.api.identifiers.org/resolutionApi/getResolverDataset"
SKIP = {
    "merops",
    "hgnc.family",
    # Appear to be unreleased records
    "f82a1a",
    "4503",
    "6vts",
}
SKIP_URI_FORMATS = {
    "http://arabidopsis.org/servlets/TairObject?accession=$1",
}


[docs] def get_miriam(force_download: bool = False, force_process: bool = False): """Get the MIRIAM registry.""" if PROCESSED_PATH.exists() and not force_download and not force_process: with PROCESSED_PATH.open() as file: return json.load(file) download(url=MIRIAM_URL, path=RAW_PATH, force=force_download) with open(RAW_PATH) as file: data = json.load(file) data["payload"]["namespaces"] = sorted(data["payload"]["namespaces"], key=itemgetter("prefix")) if force_download: with open(RAW_PATH, "w") as file: json.dump(data, file, indent=2, sort_keys=True, ensure_ascii=False) rv = { record["prefix"]: _process(record) for record in data["payload"]["namespaces"] # records whose prefixes start with `dg.` appear to be unreleased if not record["prefix"].startswith("dg.") and record["prefix"] not in SKIP } with PROCESSED_PATH.open("w") as file: json.dump(rv, file, indent=2, sort_keys=True) return rv
#: Pairs of MIRIAM prefix and provider codes to skip PROVIDER_BLACKLIST = { ("ega.study", "omicsdi"), # see discussion at https://github.com/biopragmatics/bioregistry/pull/944 ("bioproject", "ebi"), ("pmc", "ncbi"), } def _process(record): prefix = record["prefix"] rv = { "prefix": prefix, "id": record["mirId"][len("MIR:") :], "name": record["name"], "deprecated": record["deprecated"], "namespaceEmbeddedInLui": record["namespaceEmbeddedInLui"], "sampleId": record["sampleId"], "description": record["description"], "pattern": record["pattern"], } resources = [ _preprocess_resource(resource) for resource in record.get("resources", []) if not resource.get("deprecated") ] if not resources: return rv has_official = any(resource["official"] for resource in resources) if has_official: primary = next(resource for resource in resources if resource["official"]) rest = [resource for resource in resources if not resource["official"]] else: primary, *rest = resources rv["homepage"] = primary["homepage"] if URI_FORMAT_KEY in primary: rv[URI_FORMAT_KEY] = primary[URI_FORMAT_KEY] extras = [] for provider in rest: code = provider["code"] if code in SKIP_PROVIDERS or (prefix, code) in PROVIDER_BLACKLIST: continue del provider["official"] extras.append(provider) if extras: rv["providers"] = extras return rv SKIP_PROVIDERS = { "ols", # handled by the Bioregistry's metaregistry "bptl", # handled by the Bioregistry's metaregistry "bioentitylink", } def _preprocess_resource(resource): rv = { "official": resource["official"], "homepage": resource["resourceHomeUrl"], "code": resource["providerCode"], "name": resource["name"], "description": resource["description"], } uri_format = resource["urlPattern"].replace("{$id}", "$1") if uri_format not in SKIP_URI_FORMATS: rv[URI_FORMAT_KEY] = uri_format return rv class MiriamAligner(Aligner): """Aligner for the MIRIAM registry.""" key = "miriam" getter = get_miriam curation_header = ("deprecated", "name", "description") include_new = True if __name__ == "__main__": MiriamAligner.cli()