# Source code for bioregistry.external.cellosaurus

"""Download the Cellosaurus registry."""

from __future__ import annotations

import itertools as itt
import json
import logging
from collections.abc import Mapping, Sequence
from pathlib import Path
from typing import Any, ClassVar

from pystow.utils import download

from bioregistry.constants import RAW_DIRECTORY, URI_FORMAT_KEY
from bioregistry.external.alignment_utils import Aligner, load_processed

# Public API of this module.
__all__ = [
    "CellosaurusAligner",
    "get_cellosaurus",
]

logger = logging.getLogger(__name__)

# Remote flat file that get_cellosaurus() downloads and parses.
URL = "https://ftp.expasy.org/databases/cellosaurus/cellosaurus_xrefs.txt"
# Directory containing this module; the processed JSON lives next to it.
DIRECTORY = Path(__file__).parent.resolve()
# Local copy of the downloaded raw file.
RAW_PATH = RAW_DIRECTORY / "cellosaurus.txt"
# Parsed records written by get_cellosaurus() and reused on later calls.
PROCESSED_PATH = DIRECTORY / "processed.json"
# Maps field names in the raw file to the keys used in the processed records.
# Fields whose name is not listed here are dropped during parsing.
KEYMAP = {
    "Abbrev": "prefix",
    "Cat": "category",
    "Db_URL": URI_FORMAT_KEY,
    "Name": "name",
    "Server": "homepage",
}


def get_cellosaurus(
    force_download: bool = False, keep_missing_uri: bool = True
) -> dict[str, dict[str, Any]]:
    """Get the Cellosaurus registry.

    :param force_download: If true, download and re-parse the raw file even when
        the processed JSON file already exists.
    :param keep_missing_uri: If false, drop records that end up without a URI
        format string.
    :returns: A dictionary from Cellosaurus abbreviation to a record whose keys
        are the values of ``KEYMAP`` (minus the prefix itself).
    :raises IndexError: If the raw file has fewer than four ``------`` separator
        lines, i.e. its header does not have the expected layout.
    """
    if PROCESSED_PATH.exists() and not force_download:
        return load_processed(PROCESSED_PATH)

    download(url=URL, path=RAW_PATH, force=True)
    with RAW_PATH.open(encoding="ISO8859-1") as file:
        lines = [line.rstrip() for line in file]

    # Skip the file header: the records begin after the fourth separator line
    # (index 3). The extra +2 skips the separator itself plus the line right
    # after it (presumably blank).
    break_line_idxs = [i for i, line in enumerate(lines) if line.startswith("------")]
    lines = lines[break_line_idxs[3] + 2 :]

    rv = _parse_cellosaurus_records(lines, keep_missing_uri=keep_missing_uri)

    with PROCESSED_PATH.open("w") as file:
        json.dump(rv, file, indent=2, sort_keys=True)
    return rv


def _parse_cellosaurus_records(
    lines: Sequence[str], *, keep_missing_uri: bool
) -> dict[str, dict[str, Any]]:
    """Parse the ``//``-delimited records that follow the file header."""
    rv: dict[str, dict[str, Any]] = {}
    for is_separator, record_lines in itt.groupby(lines, lambda line: line == "//"):
        if is_separator:
            continue
        record: dict[str, str] = {}
        for line in record_lines:
            # Field lines carry their colon in column 6. Anything else (notes,
            # blank or short lines) is skipped; the length check keeps short
            # lines from raising an IndexError.
            if len(line) <= 6 or line[6] != ":":
                continue
            key, value = (part.strip() for part in line.split(":", 1))
            mapped_key = KEYMAP.get(key)
            if mapped_key is None:
                continue
            if mapped_key == URI_FORMAT_KEY:
                # The prefix is only used for logging, so tolerate a record
                # that lists its URL before its abbreviation.
                uri_format = _process_db_url(record.get("prefix", ""), value)
                if uri_format is None:
                    continue
                value = uri_format
            record[mapped_key] = value

        prefix = record.pop("prefix", None)
        if prefix is None:
            # Nothing to key the record on. Stay silent for empty groups
            # (e.g. trailing blank lines) but report real data being dropped.
            if record:
                logger.warning("Skipping Cellosaurus record without an abbreviation: %s", record)
            continue
        if not keep_missing_uri and URI_FORMAT_KEY not in record:
            continue
        rv[prefix] = record
    return rv
def _process_db_url(key: str, value: str) -> str | None:
    """Normalize a raw ``Db_URL`` value into a URI format string.

    :param key: The Cellosaurus abbreviation of the resource; only used in the
        log message.
    :param value: The raw ``Db_URL`` value, in which ``%s`` marks where the
        local identifier goes.
    :returns: The value with trailing slashes stripped and ``%s`` replaced by
        ``$1``, or ``None`` if the value is a placeholder (``https://%s`` or
        ``None``) or an OBO PURL that lacks a namespace.
    """
    if value in {"https://%s", "None"}:
        return None
    # An OBO PURL (plain, or double URL-encoded inside another URL) that goes
    # straight from ``/obo/`` to the identifier has no namespace in it.
    obo_purl_suffixes = (
        "http%253A%252F%252Fpurl.obolibrary.org%252Fobo%252F%s",
        "http://purl.obolibrary.org/obo/%s",
    )
    if value.endswith(obo_purl_suffixes):
        logger.debug(
            "Cellosaurus curated an OBO PURL for `%s` that is missing a namespace. "
            "See discussion at https://github.com/biopragmatics/bioregistry/issues/1259.",
            key,
        )
        return None
    return value.rstrip("/").replace("%s", "$1")


class CellosaurusAligner(Aligner):
    """Aligner for the Cellosaurus."""

    key = "cellosaurus"
    getter = get_cellosaurus
    curation_header: ClassVar[Sequence[str]] = ["name", "homepage", "category", URI_FORMAT_KEY]

    def get_skip(self) -> Mapping[str, str]:
        """Get the skipped Cellosaurus identifiers.

        :returns: A mapping from Cellosaurus abbreviation to the reason it is
            skipped.
        """
        return {
            "CCTCC": "dead site",
            "CCLV": "stub website, URL dead",
        }


if __name__ == "__main__":
    CellosaurusAligner.cli()