# Source code for bioregistry.external.ontobee

"""Download registry information from OntoBee."""

import json
import textwrap
from collections.abc import Sequence
from pathlib import Path
from typing import Any, ClassVar

from bs4 import BeautifulSoup
from pystow.utils import download

from bioregistry.constants import RAW_DIRECTORY
from bioregistry.external.alignment_utils import Aligner, load_processed

# Public names exported by this module.
__all__ = [
    "OntobeeAligner",
    "get_ontobee",
]

# Directory containing this module file; the processed JSON lives next to it.
DIRECTORY = Path(__file__).parent.resolve()
# Where the downloaded OntoBee HTML page is saved before parsing.
RAW_PATH = RAW_DIRECTORY / "ontobee.html"
# Cache of the parsed registry: read back by get_ontobee() when it exists,
# and rewritten whenever the page is re-parsed.
PROCESSED_PATH = DIRECTORY / "processed.json"

# Page that get_ontobee() downloads and scrapes.
URL = "http://www.ontobee.org/"
# Maps the single-letter code read from the fourth cell of each table row
# (after upper-casing) to the label stored under the "library" key.
LEGEND = {
    "F": "Foundry",
    "L": "Library",
    "N": "Not Specified/No",
}



# [docs] -- Sphinx viewcode link marker left over from extraction; not part of the module
def get_ontobee(force_download: bool = False) -> dict[str, dict[str, Any]]:
    """Get the OntoBee registry.

    If the processed JSON file already exists and ``force_download`` is false,
    it is loaded and returned as-is. Otherwise the OntoBee page is downloaded,
    its ontology table is parsed, the result is written to the processed JSON
    file, and the freshly parsed dictionary is returned.

    :param force_download: If true, re-download and re-parse the OntoBee page
        even when the processed JSON file already exists.
    :returns: A dictionary keyed by the text of the second cell of each table
        row (the prefix). Each value is a dictionary with a ``name`` entry
        (text of the third cell) and a ``library`` entry (the fourth cell's
        code translated through ``LEGEND``).
    :raises ValueError: If the downloaded page has no element with the id
        ``ontologyList``, or if that element contains no ``<tbody>``.
    """
    if PROCESSED_PATH.exists() and not force_download:
        return load_processed(PROCESSED_PATH)

    # Always refresh the raw HTML once we have decided to re-parse.
    download(url=URL, path=RAW_PATH, force=True)
    with RAW_PATH.open() as f:
        soup = BeautifulSoup(f, "html.parser")

    # Distinct messages make it clear which part of the page layout changed.
    ontology_list = soup.find(id="ontologyList")
    if ontology_list is None:
        raise ValueError(f"could not find an element with id 'ontologyList' in the page from {URL}")
    table_body = ontology_list.find("tbody")
    if table_body is None:
        raise ValueError(f"the 'ontologyList' element in the page from {URL} has no <tbody>")

    rv: dict[str, dict[str, Any]] = {}
    for row in table_body.find_all("tr"):  # type:ignore
        cells = row.find_all("td")
        # Cell 1 is the prefix, cell 2 the name, cell 3 the legend code.
        # A row with fewer cells raises IndexError and an unknown legend
        # code raises KeyError, so layout drift is not silently ignored.
        prefix = cells[1].text
        rv[prefix] = {
            "name": cells[2].text,
            "library": LEGEND[cells[3].text.upper()],
            # "link": cells[1].find("a").attrs["href"],
        }

    # sort_keys keeps the cached file stable across runs.
    with PROCESSED_PATH.open("w") as file:
        json.dump(rv, file, indent=2, sort_keys=True)

    return rv



class OntobeeAligner(Aligner):
    """Align the OntoBee registry, as returned by :func:`get_ontobee`."""

    key = "ontobee"
    getter = get_ontobee
    # Column titles for the curation sheet, in the same order as the values
    # produced by get_curation_row().
    curation_header: ClassVar[Sequence[str]] = ("name", "url")

    def get_curation_row(self, external_id: str, external_entry: dict[str, Any]) -> Sequence[str]:
        """Build the curation row for a single OntoBee entry.

        :param external_id: The OntoBee identifier of the entry (not used
            when building the row).
        :param external_entry: The entry's data; must contain ``name`` and
            may contain ``url``.
        :returns: A two-item list: the name shortened to at most 50
            characters, then the URL (an empty string if the entry has none).
        """
        short_name = textwrap.shorten(external_entry["name"], 50)
        url = external_entry.get("url", "")
        return [short_name, url]


# When run as a script, hand control to the aligner's command-line entry
# point. NOTE(review): cli() is not defined in this file; presumably it is
# inherited from Aligner -- confirm in alignment_utils.
if __name__ == "__main__":
    OntobeeAligner.cli()