Source code for bioregistry.external.bioportal

# -*- coding: utf-8 -*-

"""Download the NCBO BioPortal registry.

Get an API key by logging up, signing in, and navigating to https://bioportal.bioontology.org/account.
"""

import json
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional

import pystow
import requests
from tqdm.contrib.concurrent import thread_map

from bioregistry.constants import EXTERNAL

__all__ = [
    "get_bioportal",
    "get_agroportal",
    "get_ecoportal",
]

BIOPORTAL_BASE_URL = "https://data.bioontology.org"
ECOPORTAL_BASE_URL = "http://ecoportal.lifewatch.eu:8080"
AGROPORTAL_BASE_URL = "http://data.agroportal.lirmm.fr"


@dataclass
class OntoPortalClient:
    """A client for an OntoPortal site, like BioPortal."""

    metaprefix: str
    base_url: str
    api_key: Optional[str] = None
    directory: Path = field(init=False)
    raw_path: Path = field(init=False)
    processed_path: Path = field(init=False)

    def __post_init__(self):
        self.directory = EXTERNAL.joinpath(self.metaprefix)
        self.directory.mkdir(exist_ok=True, parents=True)
        self.raw_path = self.directory.joinpath("raw.json")
        self.processed_path = self.directory.joinpath("processed.json")

    def query(self, url: str, **params) -> requests.Response:
        """Query the given endpoint on the OntoPortal site.

        :param url: URL to query
        :param params: Kwargs to give as params to :func:`requests.get`
        :returns: The response from :func:`requests.get`

        The rate limit is 15 queries per second. See:
        https://www.bioontology.org/wiki/Annotator_Optimizing_and_Troublehooting
        """
        if self.api_key is None:
            self.api_key = pystow.get_config(self.metaprefix, "api_key", raise_on_missing=True)
        params.setdefault("apikey", self.api_key)
        return requests.get(url, params=params)

    def download(self, force_download: bool = False):
        """Get the full dump of the OntoPortal site's registry."""
        if self.processed_path.exists() and not force_download:
            with self.processed_path.open() as file:
                return json.load(file)

        # see https://data.bioontology.org/documentation#Ontology
        res = self.query(self.base_url + "/ontologies", summaryOnly=False, notes=True)
        records = res.json()
        with self.raw_path.open("w") as file:
            json.dump(records, file, indent=2, sort_keys=True)

        records = thread_map(self.process, records, disable=True)
        rv = {result["prefix"]: result for result in records}

        with self.processed_path.open("w") as file:
            json.dump(rv, file, indent=2, sort_keys=True)
        return rv

    def process(self, entry):
        """Process a record from the OntoPortal site's API."""
        prefix = entry["acronym"]
        extra_data = {}
        # res = query(f'{BASE_URL}/ontologies/{bioportal_key}/latest_submission')
        # if res:
        #     extra_data = res.json()
        # else:
        #     print('failed on', bioportal_key)
        #     extra_data = {}

        contact = (extra_data.get("contact") or [{}])[0]
        rv = {
            "prefix": prefix,
            "name": entry["name"],
            "description": extra_data.get("description"),
            "contact": contact.get("email"),
            "contact.label": contact.get("name"),
            "homepage": extra_data.get("homepage"),
            "version": extra_data.get("version"),
            "publication": extra_data.get("publication"),
        }
        return {k: v for k, v in rv.items() if v is not None}


bioportal_client = OntoPortalClient(
    metaprefix="bioportal",
    base_url=BIOPORTAL_BASE_URL,
)


[docs]def get_bioportal(force_download: bool = False):
    """Get the BioPortal registry."""
    return bioportal_client.download(force_download=force_download)


ecoportal_client = OntoPortalClient(
    metaprefix="ecoportal",
    base_url=ECOPORTAL_BASE_URL,
)


def get_ecoportal(force_download: bool = False):
    """Get the EcoPortal registry."""
    return ecoportal_client.download(force_download=force_download)


agroportal_client = OntoPortalClient(
    metaprefix="agroportal",
    base_url=AGROPORTAL_BASE_URL,
)


def get_agroportal(force_download: bool = False):
    """Get the AgroPortal registry."""
    return agroportal_client.download(force_download=force_download)


if __name__ == "__main__":
    print("AgroPortal has", len(get_agroportal(force_download=True)))  # noqa:T201
    print("EcoPortal has", len(get_ecoportal(force_download=True)))  # noqa:T201
    print("BioPortal has", len(get_bioportal(force_download=True)))  # noqa:T201