Source code for bioregistry.external.cellosaurus

# -*- coding: utf-8 -*-

"""Download the Cellosaurus registry."""

import itertools as itt
import json

from pystow.utils import download

from bioregistry.constants import EXTERNAL, URI_FORMAT_KEY

# Remote location of the Cellosaurus cross-reference listing that
# get_cellosaurus() downloads and parses
URL = "https://ftp.expasy.org/databases/cellosaurus/cellosaurus_xrefs.txt"

# Local directory for this resource's files. NOTE: the directory is created
# as a side effect of importing this module.
DIRECTORY = EXTERNAL / "cellosaurus"
DIRECTORY.mkdir(exist_ok=True, parents=True)
# Where the raw download is stored
RAW_PATH = DIRECTORY / "raw.txt"
# Where the parsed records are cached as JSON
PROCESSED_PATH = DIRECTORY / "processed.json"
# Maps field names found in the raw file to the keys used in the processed
# records; fields that are not listed here are dropped during parsing
KEYMAP = {
    "Abbrev": "prefix",
    "Cat": "category",
    "Db_URL": URI_FORMAT_KEY,
    "Name": "name",
    "Server": "homepage",
}


def get_cellosaurus(force_download: bool = False, keep_missing_uri: bool = True):
    """Get the Cellosaurus registry.

    If a processed JSON cache exists and ``force_download`` is false, the
    cache is loaded and returned as-is. Otherwise, the raw file is downloaded,
    parsed, written to the cache, and returned.

    :param force_download: If true, download and process the raw file again
        even when the processed JSON cache already exists.
    :param keep_missing_uri: If false, drop records for which no usable URI
        format string was parsed. Only applied when the raw file is processed,
        not when the cache is loaded.
    :returns: A dictionary mapping each record's ``Abbrev`` value to a
        dictionary of its remaining mapped fields (see ``KEYMAP``).
    :raises IndexError: If the raw file contains fewer than four lines
        starting with ``------``.
    """
    if PROCESSED_PATH.exists() and not force_download:
        with PROCESSED_PATH.open() as file:
            return json.load(file)

    download(url=URL, path=RAW_PATH, force=True)
    with RAW_PATH.open(encoding="ISO8859-1") as file:
        lines = [line.rstrip() for line in file]

    # Everything up to the fourth separator line (index 3) is skipped, plus
    # the separator itself and the line that follows it
    break_line_idxs = [i for i, line in enumerate(lines) if line.startswith("------")]
    lines = lines[break_line_idxs[3] + 2 :]

    rv = {}
    # Records are terminated by a line that consists solely of "//"
    for is_terminator, record_lines in itt.groupby(lines, lambda line: line == "//"):
        if is_terminator:
            continue
        d = {}
        for line in record_lines:
            # Field lines have a colon as their seventh character; anything
            # else (notes, blank lines) is skipped. Slicing instead of
            # indexing keeps lines shorter than seven characters from
            # raising an IndexError.
            if line[6:7] != ":":
                continue
            key, value = (s.strip() for s in line.split(":", 1))
            mapped_key = KEYMAP.get(key)
            if mapped_key is None:
                continue
            if mapped_key == URI_FORMAT_KEY:
                value = _process_db_url(value)
                if value is None:
                    continue
            d[mapped_key] = value

        prefix = d.pop("prefix", None)
        if prefix is None:
            # Without an ``Abbrev`` field there is nothing to key the record
            # on (e.g., a group that only contains blank lines), so skip it
            # rather than failing with a KeyError
            continue
        if not keep_missing_uri and URI_FORMAT_KEY not in d:
            continue
        rv[prefix] = d

    with PROCESSED_PATH.open("w") as file:
        json.dump(rv, file, indent=2, sort_keys=True)

    return rv


def _process_db_url(value):
    if value in {"https://%s", "None"}:
        return
    return value.rstrip("/").replace("%s", "$1")


if __name__ == "__main__":
    # Rebuild the registry from a fresh download and report how many
    # records it contains
    registry = get_cellosaurus(force_download=True)
    print(len(registry))  # noqa:T201