Source code for bioregistry.external.bartoc

"""Download the BARTOC registry."""

import json
from collections.abc import Sequence
from pathlib import Path
from typing import Any, ClassVar

import requests
from tqdm import tqdm

from bioregistry.constants import URI_FORMAT_KEY
from bioregistry.external.alignment_utils import Aligner, load_processed
from bioregistry.license_standardizer import standardize_license

__all__ = [
    "BartocAligner",
    "get_bartoc",
]

HERE = Path(__file__).parent.resolve()
PROCESSED_PATH = HERE / "processed.json"
URL = "https://bartoc.org/data/dumps/latest.ndjson"


def get_bartoc(*, force_download: bool = True) -> dict[str, dict[str, Any]]:
    """Get the BARTOC registry.

    :param force_download: If true, forces download. If false and the file is already
        cached, reuses it.
    :returns: The BARTOC registry, as a dictionary from BARTOC prefix to the processed
        record for that prefix.
    :raises requests.HTTPError: If the server answers the dump request with an error
        status code.

    .. seealso:: https://bartoc.org/
    """
    if PROCESSED_PATH.is_file() and not force_download:
        return load_processed(PROCESSED_PATH)

    response = requests.get(URL, timeout=15)
    # Fail loudly on an HTTP error instead of trying to parse an error page as NDJSON.
    response.raise_for_status()

    rv: dict[str, dict[str, Any]] = {}
    for line in response.iter_lines():
        # iter_lines() can yield empty keep-alive/trailing lines, which are not valid JSON.
        if not line:
            continue
        record = _process_bartoc_record(json.loads(line))
        rv[record["prefix"]] = record

    # NOTE(review): no explicit encoding is passed, so non-ASCII text (ensure_ascii=False)
    # is written with the platform default encoding - confirm this matches how
    # load_processed reads the file before pinning it to UTF-8.
    PROCESSED_PATH.write_text(json.dumps(rv, indent=2, ensure_ascii=False, sort_keys=True))
    return rv
# BARTOC prefixes for which no URI format should be extracted (currently empty).
URI_FORMAT_SKIPS: dict[str, str] = {}


def _process_bartoc_record(record: dict[str, Any]) -> dict[str, Any]:
    """Flatten a raw BARTOC record into a dictionary of registry metadata.

    :param record: A JSON-decoded record from the BARTOC dump. It must contain a
        ``uri`` key; all other keys are optional. The record is not modified.
    :returns: A dictionary that always contains ``prefix`` and, when a non-empty
        value is available, ``description``, ``homepage``, ``name``, ``pattern``,
        ``wikidata_database``, ``abbreviation``, ``license``, the URI format
        (stored under ``URI_FORMAT_KEY``), and ``example``.
    """
    prefix = record["uri"][len("http://bartoc.org/en/node/") :]

    # Tolerate missing keys, JSON nulls, and an empty list of English definitions.
    definitions = (record.get("definition") or {}).get("en") or [""]
    rv: dict[str, Any] = {
        "prefix": prefix,
        "description": definitions[0].strip('"').strip(),
        "homepage": (record.get("url") or "").strip(),
        "name": ((record.get("prefLabel") or {}).get("en") or "").strip(),
    }

    pattern = record.get("notationPattern")
    if pattern:
        # Normalize so the pattern is anchored exactly once on each side.
        rv["pattern"] = "^" + pattern.strip().lstrip("^").rstrip("$") + "$"

    for identifier in record.get("identifier", []):
        if identifier.startswith("http://www.wikidata.org/entity/"):
            rv["wikidata_database"] = identifier[len("http://www.wikidata.org/entity/") :]

    abbreviations = record.get("notation")
    if abbreviations:
        if len(abbreviations) > 1:
            tqdm.write(f"[bartoc:{prefix}] got multiple abbr.: {abbreviations}")
        # Only the first notation is kept.
        abbreviation = abbreviations[0].strip()
        if " " in abbreviation:
            tqdm.write(f"[bartoc:{prefix}] space in abbr.: {abbreviation}")
        rv["abbreviation"] = abbreviation

    for license_dict in record.get("license", []):
        license_key = standardize_license(license_dict["uri"].strip())
        if license_key:
            rv["license"] = license_key

    if prefix in URI_FORMAT_SKIPS:
        pass
    elif uri_prefix := record.get("namespace"):
        rv[URI_FORMAT_KEY] = uri_prefix.strip() + "$1"
    elif uri_pattern := record.get("uriPattern"):
        # Locate the first "(...)" group; the closing parenthesis must come after the
        # opening one, otherwise slicing with a -1 index would build a bogus format.
        left_pos = uri_pattern.find("(")
        right_pos = uri_pattern.find(")", left_pos + 1) if left_pos != -1 else -1
        if left_pos == -1 or right_pos == -1:
            tqdm.write(f"bad URI pattern: {uri_pattern}, assuming is URI prefix")
            rv[URI_FORMAT_KEY] = uri_pattern.strip() + "$1"
        else:
            # Replace the group with the "$1" placeholder.
            rv[URI_FORMAT_KEY] = uri_pattern[:left_pos] + "$1" + uri_pattern[1 + right_pos :]

    if examples := record.get("EXAMPLES"):
        rv["example"] = examples[0].strip()

    # Drop entries whose value is empty so only populated fields are returned.
    return {k: v for k, v in rv.items() if k and v}


class BartocAligner(Aligner):
    """Aligner for BARTOC."""

    key = "bartoc"
    getter = get_bartoc
    alt_key_match = "abbreviation"
    curation_header: ClassVar[Sequence[str]] = ["name", "homepage", "description"]


if __name__ == "__main__":
    BartocAligner.cli()