# Source code for bioregistry.external.bartoc

"""Download the BARTOC registry."""

import json
from collections.abc import Sequence
from pathlib import Path
from typing import Any, ClassVar

import requests
from tqdm import tqdm

from bioregistry.constants import URI_FORMAT_KEY
from bioregistry.license_standardizer import standardize_license

from ..alignment_utils import Aligner, load_processed

__all__ = [
    "BartocAligner",
    "get_bartoc",
]

HERE = Path(__file__).parent.resolve()
PROCESSED_PATH = HERE / "processed.json"
URL = "https://bartoc.org/data/dumps/latest.ndjson"


def get_bartoc(*, force_download: bool = True) -> dict[str, dict[str, Any]]:
    """Get the BARTOC registry.

    :param force_download: If true, forces download. If false and the file is
        already cached, reuses it.
    :returns: The BARTOC registry, as a mapping from each record's ``prefix``
        to the processed record
    :raises requests.HTTPError: If the server answers the dump request with
        an error status

    .. seealso:: https://bartoc.org/
    """
    if PROCESSED_PATH.is_file() and not force_download:
        return load_processed(PROCESSED_PATH)

    rv: dict[str, dict[str, Any]] = {}
    # The context manager releases the connection even if parsing fails midway.
    with requests.get(URL, timeout=15) as response:
        # Fail loudly on an HTTP error rather than feeding an error page to
        # the JSON parser, which would surface as a confusing decode error.
        response.raise_for_status()
        for line in response.iter_lines():
            # The dump is newline-delimited JSON; skip blank lines since
            # json.loads() cannot parse an empty payload.
            if not line:
                continue
            record = _process_bartoc_record(json.loads(line))
            rv[record["prefix"]] = record

    PROCESSED_PATH.write_text(json.dumps(rv, indent=2, ensure_ascii=False, sort_keys=True))
    return rv
def _process_bartoc_record(record: dict[str, Any]) -> dict[str, Any]:
    """Flatten a raw BARTOC record into a dictionary of simple fields.

    :param record: A single decoded JSON object from the BARTOC dump. It must
        have a ``uri`` entry; all other entries are optional.
    :returns: A dictionary with the ``prefix`` and whichever of the name,
        description, homepage, pattern, Wikidata identifier, abbreviation,
        license, and URI format could be extracted. Entries whose value is
        empty are omitted.
    """
    # The prefix is whatever follows the BARTOC node URI stem.
    prefix = record["uri"][len("http://bartoc.org/en/node/") :]

    # Fall back to a blank definition when the English list is missing,
    # null, or empty, instead of failing on the ``[0]`` lookup.
    definitions = record.get("definition", {}).get("en") or [""]
    rv = {
        "prefix": prefix,
        "description": definitions[0].strip('"').strip(),
        "homepage": record.get("url", "").strip(),
        "name": record.get("prefLabel", {}).get("en", "").strip(),
    }

    pattern = record.get("notationPattern")
    if pattern:
        # Normalize so the pattern is anchored exactly once at each end.
        rv["pattern"] = "^" + pattern.strip().lstrip("^").rstrip("$") + "$"

    # If several Wikidata identifiers are listed, the last one wins.
    for identifier in record.get("identifier", []):
        if identifier.startswith("http://www.wikidata.org/entity/"):
            rv["wikidata_database"] = identifier[len("http://www.wikidata.org/entity/") :]

    abbreviations = record.get("notation")
    if abbreviations:
        if len(abbreviations) > 1:
            tqdm.write(f"[bartoc:{prefix}] got multiple abbr.: {abbreviations}")
        # Only the first abbreviation is kept.
        abbreviation = abbreviations[0].strip()
        if " " in abbreviation:
            tqdm.write(f"[bartoc:{prefix}] space in abbr.: {abbreviation}")
        rv["abbreviation"] = abbreviation

    # If several licenses standardize successfully, the last one wins.
    for license_dict in record.get("license", []):
        license_key = standardize_license(license_dict["uri"].strip())
        if license_key:
            rv["license"] = license_key

    uri_pattern = record.get("uriPattern")
    if uri_pattern:
        left_pos = uri_pattern.find("(")
        # Look for the closing parenthesis only *after* the opening one, so a
        # stray ")" earlier in the string cannot yield a garbled format.
        right_pos = uri_pattern.find(")", left_pos + 1) if left_pos >= 0 else -1
        if right_pos >= 0:
            # Replace the first parenthesized group with the "$1" placeholder.
            rv[URI_FORMAT_KEY] = uri_pattern[:left_pos] + "$1" + uri_pattern[1 + right_pos :]

    return {k: v for k, v in rv.items() if k and v}


class BartocAligner(Aligner):
    """Aligner for BARTOC."""

    # Identifier for this external registry.
    key = "bartoc"
    # Function that produces the processed BARTOC records.
    getter = get_bartoc
    # Name of the processed-record field holding the abbreviation; how the
    # base class uses it is not visible here.
    alt_key_match = "abbreviation"
    # Processed-record fields listed in the curation header.
    curation_header: ClassVar[Sequence[str]] = ["name", "homepage", "description"]


if __name__ == "__main__":
    BartocAligner.cli()