# Source code for bioregistry.external.prefixcommons

# -*- coding: utf-8 -*-

"""Download registry information from the Life Science Registry (LSR), which powers Prefix Commons.

.. seealso::

    - which should expand to

import json
from typing import Any, Dict

from pystow.utils import download

from bioregistry.constants import EXTERNAL

__all__ = [
    "get_prefixcommons",
]

DIRECTORY = EXTERNAL / "prefixcommons"
DIRECTORY.mkdir(exist_ok=True, parents=True)
RAW_PATH = DIRECTORY / "raw.tsv"
PROCESSED_PATH = DIRECTORY / "processed.json"

# Base URL of the Google Sheet that backs the Life Science Registry.
# NOTE(review): the concrete value was lost from this copy of the file —
# restore the real spreadsheet URL from upstream before running.
GOOGLE_DOCUMENT_ID = "https://docs.google.com/spreadsheets/d/MISSING_SHEET_ID"
URL = f"{GOOGLE_DOCUMENT_ID}/export?format=tsv&gid=0"

# Normalized keys for the TSV export's columns, in sheet order. Short keys are
# the names used in the processed records (original sheet header in the
# trailing comment); columns that are not retained keep their raw header text
# so the positional zip() in _process_row stays aligned with the sheet.
# NOTE(review): _process_row also reads "keywords" and "synonyms", which do
# not appear below — those entries were likely lost from this copy of the
# file; verify the full column list against the live sheet.
COLUMNS = [
    "prefix",  # "Preferred Prefix",
    "Provider Base URI",
    "Alternative Base URI",
    "bioportal",  # "BioPortal Ontology ID",
    "miriam",  # "",
    "name",  # originally: Title,
    "description",  # "Description",
    "pubmed_ids",  # "PubMed ID"
    "Type (warehouse, dataset or terminology)",
    "homepage",  # "Homepage",
    "sub-namespace in dataset",
    "part of collection",
    "License URL",
    "License Text",
    "pattern",  # "ID regex",
    "example",  # "ExampleID",
    "uri_format",  # "Provider HTML URL",
    "MIRIAM checked",
    "MIRIAM curator notes",
    "MIRIAM coverage",
    "year last accessible",
    "wayback url",
    "last updated",
    "last updated by",
    "last updated by (orcid)",
]

# The subset of COLUMNS keys whose values are kept in the processed records.
# NOTE(review): reconstructed from the keys _process_row reads — confirm
# against upstream.
KEEP = {
    "bioportal",
    "description",
    "example",
    "homepage",
    "keywords",
    "miriam",
    "name",
    "pattern",
    "pubmed_ids",
    "synonyms",
    "uri_format",
}

def get_prefixcommons(force_download: bool = False) -> Dict[str, Dict[str, Any]]:
    """Get the Life Science Registry.

    :param force_download: If true, re-download the TSV export even when a
        processed JSON file already exists.
    :return: A mapping from prefix to its processed metadata record.
    """
    # Serve the cached, processed result unless a re-download is forced.
    if PROCESSED_PATH.exists() and not force_download:
        with PROCESSED_PATH.open() as file:
            return json.load(file)

    download(url=URL, path=RAW_PATH, force=True)
    rows = {}
    with RAW_PATH.open() as file:
        lines = iter(file)
        next(lines)  # throw away header
        for line in lines:
            prefix, data = _process_row(line)
            # Rows missing required fields come back as (None, None).
            if prefix and data:
                rows[prefix] = data
    PROCESSED_PATH.write_text(json.dumps(rows, sort_keys=True, indent=2))
    return rows
def _process_row(line: str):
    """Parse one TSV row of the LSR sheet into a (prefix, record) pair.

    :param line: A raw tab-separated line from the sheet export.
    :return: The prefix (first cell) and a dict of kept metadata, or
        ``(None, None)`` when any required field is missing.
    """
    cells = line.strip().split("\t")
    # The sheet uses an explicit "N/A" marker for missing values.
    cells_processed = [None if cell in {"N/A"} else cell for cell in cells]
    rv: Dict[str, Any] = {
        key: value.strip()
        for key, value in zip(COLUMNS, cells_processed)
        if key and value and key in KEEP
    }
    # Discard rows that lack any of the required fields.
    for key in ["name", "description", "example", "pattern"]:
        if not rv.get(key):
            return None, None
    # Split comma-delimited multi-valued fields into lists.
    for key in ["keywords", "pubmed_ids", "synonyms"]:
        values = rv.get(key)
        if values:
            rv[key] = [value.strip() for value in values.split(",")]
    uri_format = rv.get("uri_format")
    if uri_format:
        # Normalize the sheet's several placeholder spellings to "$1".
        rv["uri_format"] = (
            uri_format.replace("$id", "$1").replace("[?id]", "$1").replace("$d", "$1")
        )
    pattern = rv.get("pattern")
    if pattern:
        # Anchor the regular expression at both ends.
        if not pattern.startswith("^"):
            pattern = f"^{pattern}"
        if not pattern.endswith("$"):
            pattern = f"{pattern}$"
        rv["pattern"] = pattern
    return cells[0], rv


if __name__ == "__main__":
    print(len(get_prefixcommons(force_download=True)))  # noqa:T201