"""Download the NCBO BioPortal registry.
Get an API key by signing up, logging in, and navigating to https://bioportal.bioontology.org/account.
"""
import json
import math
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Optional
import pystow
import requests
from tqdm import tqdm
from tqdm.contrib.concurrent import thread_map
from bioregistry.constants import EMAIL_RE, RAW_DIRECTORY
from bioregistry.external.alignment_utils import load_processed
from bioregistry.license_standardizer import standardize_license
from bioregistry.utils import removeprefix
# Public accessor functions exported by this module.
__all__ = [
    "get_agroportal",
    "get_bioportal",
    "get_ecoportal",
]

# Root URLs of the REST API of each supported OntoPortal site. Endpoint paths
# such as "/ontologies" are appended directly, so none ends with a slash.
# NOTE(review): the EcoPortal and AgroPortal URLs use plain HTTP, and the API key
# is sent as a query parameter - confirm whether HTTPS endpoints are available.
BIOPORTAL_BASE_URL = "https://data.bioontology.org"
ECOPORTAL_BASE_URL = "http://ecoportal.lifewatch.eu:8080"
AGROPORTAL_BASE_URL = "http://data.agroportal.lirmm.fr"

# Directory containing this module; the processed registry dumps are stored here.
DIRECTORY = Path(__file__).parent.resolve()
@dataclass
class OntoPortalClient:
    """A client for an OntoPortal site, like BioPortal.

    The client downloads the ontology listing from the site's REST API, enriches
    each record with details from its latest submission, and writes both the
    preprocessed (raw) and the processed records to JSON files.
    """

    # Prefix identifying the site; used for the output file names and as the
    # pystow configuration module from which the API key is looked up.
    metaprefix: str
    # Root URL of the site's REST API; endpoint paths are appended directly.
    base_url: str
    # API key; when None it is looked up via pystow on the first query.
    api_key: Optional[str] = None
    # Path of the preprocessed (raw) dump; set in ``__post_init__``.
    raw_path: Path = field(init=False)
    # Path of the processed dump; set in ``__post_init__``.
    processed_path: Path = field(init=False)
    # Number of threads used to fetch the submission details concurrently.
    max_workers: int = 2

    def __post_init__(self) -> None:
        """Derive the raw and processed output paths from the metaprefix."""
        self.raw_path = RAW_DIRECTORY.joinpath(self.metaprefix).with_suffix(".json")
        self.processed_path = DIRECTORY.joinpath(self.metaprefix).with_suffix(".json")

    def query(self, url: str, **params: Any) -> requests.Response:
        """Query the given endpoint on the OntoPortal site.

        The API key is added as the ``apikey`` query parameter unless the caller
        already supplied one. If no key was given to the client, it is looked up
        with :func:`pystow.get_config` (which raises if it is missing) and stored
        on the client for later queries.

        :param url: URL to query
        :param params: Kwargs to give as params to :func:`requests.get`
        :returns: The response from :func:`requests.get`

        The rate limit is 15 queries per second. See:
        https://www.bioontology.org/wiki/Annotator_Optimizing_and_Troublehooting
        """
        if self.api_key is None:
            self.api_key = pystow.get_config(self.metaprefix, "api_key", raise_on_missing=True)
        params.setdefault("apikey", self.api_key)
        return requests.get(url, params=params, timeout=30)

    def download(self, force_download: bool = False) -> dict[str, dict[str, Any]]:
        """Get the full dump of the OntoPortal site's registry.

        :param force_download: If true, query the API even if the processed file
            already exists
        :returns: A dictionary from each ontology's prefix (its acronym on the
            site) to its processed record
        :raises requests.HTTPError: If the ontology listing request fails
        """
        if self.processed_path.exists() and not force_download:
            return load_processed(self.processed_path)

        # see https://data.bioontology.org/documentation#Ontology
        res = self.query(self.base_url + "/ontologies", summaryOnly=False, notes=True)
        # Surface a failed listing (e.g., a rejected API key) as an HTTP error
        # rather than an obscure failure while iterating over an error payload.
        res.raise_for_status()
        records = res.json()
        records = thread_map(  # type:ignore
            self._preprocess,
            records,
            unit="ontology",
            max_workers=self.max_workers,
            desc=f"Preprocessing {self.metaprefix}",
        )
        with self.raw_path.open("w") as file:
            json.dump(records, file, indent=2, sort_keys=True, ensure_ascii=False)

        # ``desc`` is the progress-bar keyword understood by tqdm.
        records = thread_map(  # type:ignore
            self.process, records, disable=True, desc=f"Processing {self.metaprefix}"
        )
        rv = {result["prefix"]: result for result in records}
        with self.processed_path.open("w") as file:
            json.dump(rv, file, indent=2, sort_keys=True, ensure_ascii=False)
        return rv

    @staticmethod
    def _clean_text(text: str) -> str:
        """Trim a text value, replace carriage returns, and collapse repeated spaces.

        :param text: The text to clean
        :returns: The text with surrounding whitespace removed, ``\\r\\n`` and
            ``\\r`` replaced by a space, and every run of spaces reduced to one
        """
        text = text.strip().replace("\r\n", " ").replace("\r", " ").strip()
        # Splitting on a single space yields an empty part for each extra space
        # in a run; dropping the empty parts leaves exactly one space per run.
        return " ".join(part for part in text.split(" ") if part)

    def _preprocess(self, record: dict[str, Any]) -> dict[str, Any]:
        """Enrich a listing record with details from its latest submission.

        The record is modified in place. If the submission request does not
        return status 200, a message is written and the record is returned
        without enrichment.

        :param record: A record from the site's ontology listing
        :returns: The enriched record without its empty values, or the
            record itself if the submission could not be retrieved
        """
        record.pop("@context", None)
        prefix = record["acronym"]
        url = f"{self.base_url}/ontologies/{prefix}/latest_submission"
        res = self.query(url, display="all")
        if res.status_code != 200:
            tqdm.write(
                f"{self.metaprefix}:{prefix} had issue getting submission details: {res.text}"
            )
            return record
        res_json = res.json()

        # The publication field may be a single string or a list of strings.
        publications = res_json.get("publication")
        if isinstance(publications, str):
            record["publications"] = [publications]
        elif isinstance(publications, list):
            record["publications"] = publications

        for key in [
            "homepage",
            "version",
            "description",
            "exampleIdentifier",
            "repository",
        ]:
            value = res_json.get(key)
            if not value:
                continue
            # Unwrap single-element lists
            if isinstance(value, list) and len(value) == 1:
                value = value[0]
            # Numeric values (e.g., a version like 1.2) are kept as text
            if isinstance(value, float) and not math.isnan(value):
                value = str(value)
            if not isinstance(value, str):
                tqdm.write(f"got non-string value ({type(value)}) for {key}: {value}")
                continue
            record[key] = self._clean_text(value)

        license_stub = res_json.get("hasLicense")
        if license_stub:
            record["license"] = standardize_license(license_stub)

        # Drop "@"-prefixed keys and empty values from each contact; ``or []``
        # also covers a contact field that is present but null.
        contacts = [
            {k: v.strip() for k, v in contact.items() if not k.startswith("@") and v}
            for contact in res_json.get("contact") or []
        ]
        contacts = [contact for contact in contacts if EMAIL_RE.match(contact.get("email", ""))]
        if contacts:
            contact = contacts[0]
            # TODO consider sorting contacts in a canonical order?
            # contact = min(contacts, key=lambda c: len(c["email"]))
            record["contact"] = {k: v for k, v in contact.items() if k != "id"}
            name = record["contact"].get("name")
            if name:
                # Remove a leading "Dr. " or "Dr " title from the contact's name
                record["contact"]["name"] = removeprefix(removeprefix(name, "Dr. "), "Dr ")

        return {k: v for k, v in record.items() if v}

    def process(self, entry: dict[str, Any]) -> dict[str, Any]:
        """Process a record from the OntoPortal site's API.

        :param entry: A preprocessed record; must contain ``acronym`` and ``name``
        :returns: A record using the output key names (``prefix``, ``name``,
            ``example_uri``, ...) with empty values omitted
        """
        prefix = entry["acronym"]
        rv = {
            "prefix": prefix,
            "name": entry["name"].strip(),
            "description": entry.get("description"),
            "contact": entry.get("contact"),
            "homepage": entry.get("homepage"),
            "version": entry.get("version"),
            "publications": entry.get("publications"),
            "repository": entry.get("repository"),
            "example_uri": entry.get("exampleIdentifier"),
            "license": entry.get("license"),
        }
        return {k: v for k, v in rv.items() if v}
# Module-level client for BioPortal, used by get_bioportal().
bioportal_client = OntoPortalClient(metaprefix="bioportal", base_url=BIOPORTAL_BASE_URL)
def get_bioportal(force_download: bool = False) -> dict[str, dict[str, Any]]:
    """Get the BioPortal registry.

    :param force_download: If true, query the BioPortal API even if a processed
        copy of the registry already exists on disk
    :returns: A dictionary from each ontology's prefix to its processed record
    """
    return bioportal_client.download(force_download=force_download)
# Module-level client for EcoPortal, used by get_ecoportal().
ecoportal_client = OntoPortalClient(metaprefix="ecoportal", base_url=ECOPORTAL_BASE_URL)
def get_ecoportal(force_download: bool = False) -> dict[str, dict[str, Any]]:
    """Get the EcoPortal registry.

    :param force_download: If true, query the EcoPortal API even if a processed
        copy of the registry already exists on disk
    :returns: A dictionary from each ontology's prefix to its processed record
    """
    registry = ecoportal_client.download(force_download=force_download)
    return registry
# Module-level client for AgroPortal, used by get_agroportal().
agroportal_client = OntoPortalClient(metaprefix="agroportal", base_url=AGROPORTAL_BASE_URL)
def get_agroportal(force_download: bool = False) -> dict[str, dict[str, Any]]:
    """Get the AgroPortal registry.

    :param force_download: If true, query the AgroPortal API even if a processed
        copy of the registry already exists on disk
    :returns: A dictionary from each ontology's prefix to its processed record
    """
    registry = agroportal_client.download(force_download=force_download)
    return registry
if __name__ == "__main__":
    # Re-download every registry, in the same order as before, and report its size.
    for _label, _getter in (
        ("EcoPortal", get_ecoportal),
        ("AgroPortal", get_agroportal),
        ("BioPortal", get_bioportal),
    ):
        print(_label, "has", len(_getter(force_download=True)))  # noqa:T201