"""Scraper for FAIRsharing.
.. seealso:: https://beta.fairsharing.org/API_doc
"""
from __future__ import annotations
import json
import logging
import re
from collections.abc import MutableMapping, Sequence
from pathlib import Path
from typing import Any, ClassVar
from bioregistry.constants import ORCID_PATTERN
from bioregistry.external.alignment_utils import Aligner, load_processed
from bioregistry.license_standardizer import standardize_license
from bioregistry.utils import removeprefix, removesuffix
# Public API of this module
__all__ = [
    "FairsharingAligner",
    "get_fairsharing",
]

logger = logging.getLogger(__name__)

# Directory containing this module; the processed cache file lives next to it
DIRECTORY = Path(__file__).parent.resolve()
# JSON file holding the processed records, read and written by get_fairsharing()
PROCESSED_PATH = DIRECTORY / "processed.json"

# FAIRsharing record types kept by _process_record(); a record of any other
# type is dropped. The commented-out types below are currently excluded.
ALLOWED_TYPES = {
    "terminology_artefact",
    "identifier_schema",
    # "knowledgebase",
    # "knowledgebase_and_repository",
    # "repository",
}

# Compiled ORCID pattern used by _process_record() to validate contact ORCIDs
ORCID_RE = re.compile(ORCID_PATTERN)
def get_fairsharing(
    *, force_download: bool = False, force_reload: bool = False, use_tqdm: bool = True
) -> dict[str, dict[str, Any]]:
    """Get the FAIRsharing registry.

    The processed records are cached in :data:`PROCESSED_PATH`. The cache is
    returned directly when it exists and neither ``force_download`` nor
    ``force_reload`` is set; otherwise the raw records are loaded through
    :func:`fairsharing_client.load_fairsharing`, processed, and the cache is
    rewritten.

    :param force_download: Bypass the processed cache; also forwarded to
        :func:`fairsharing_client.load_fairsharing`.
    :param force_reload: Bypass the processed cache and re-process the records.
    :param use_tqdm: Forwarded to :func:`fairsharing_client.load_fairsharing`.
    :returns: A mapping from FAIRsharing prefix to its processed record.
        Records rejected by :func:`_process_record` are left out.
    """
    if PROCESSED_PATH.exists() and not force_download and not force_reload:
        return load_processed(PROCESSED_PATH)

    # Imported here rather than at module level, so the client package is only
    # required when the cache actually has to be rebuilt.
    from fairsharing_client import load_fairsharing

    data = load_fairsharing(force_download=force_download, use_tqdm=use_tqdm)
    rv: dict[str, dict[str, Any]] = {}
    for prefix, record in data.items():
        new_record = _process_record(record)
        if new_record:
            rv[prefix] = new_record

    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened with an explicit encoding; the locale default is not guaranteed to
    # be able to represent them (e.g. cp1252 on Windows).
    with PROCESSED_PATH.open("w", encoding="utf-8") as file:
        json.dump(rv, file, indent=2, ensure_ascii=False, sort_keys=True)
    return rv
# Top-level keys of a FAIRsharing record that _process_record() copies
# unchanged into the processed record whenever their value is non-empty
KEEP = {
    "description",
    "name",
    "subjects",
    "user_defined_tags",
    "domains",
}
#: Generic trailing words stripped from FAIRsharing abbreviations. Longer
#: variants are listed before the shorter ones they end with
#: (" Controlled Vocabulary" before " Vocabulary") so the most specific
#: suffix is the one that gets removed.
_ABBREVIATION_SUFFIXES = (
    " CT",
    " CV",
    " Controlled Vocabulary",
    " Terminology",
    " Ontology",
    " Thesaurus",
    " Vocabulary",
    " Taxonomy",
)


def _strip_abbreviation_suffix(abbreviation: str) -> str:
    """Strip the first matching generic suffix (e.g. ``" Ontology"``) from an abbreviation."""
    for suffix in _ABBREVIATION_SUFFIXES:
        stripped = removesuffix(abbreviation, suffix)
        if stripped != abbreviation:
            return stripped
    return abbreviation


def _process_record(record: MutableMapping[str, Any]) -> dict[str, Any] | None:
    """Convert a raw FAIRsharing record into its processed form.

    :param record: A raw FAIRsharing record. The keys read are ``record_type``,
        the keys in :data:`KEEP`, ``abbreviation``, ``metadata``,
        ``url_for_logo``, ``publications`` and ``licence_links``.
    :returns: The processed record with all empty values removed, or ``None``
        if the record's ``record_type`` is not in :data:`ALLOWED_TYPES`.
    """
    if record.get("record_type") not in ALLOWED_TYPES:
        return None

    # .get() so that a record lacking one of the KEEP keys is skipped for that
    # key instead of raising KeyError
    rv: dict[str, Any] = {key: value for key in KEEP if (value := record.get(key))}

    abbreviation = record.get("abbreviation")
    if abbreviation:
        # Previously each loop iteration overwrote the result using the
        # untouched abbreviation, so only the last suffix was ever stripped.
        rv["abbreviation"] = _strip_abbreviation_suffix(abbreviation)

    metadata = record.get("metadata") or {}

    url_for_logo = record.get("url_for_logo")
    if url_for_logo:
        # The logo path is relative to the FAIRsharing API host
        rv["logo"] = "https://api.fairsharing.org" + url_for_logo

    homepage = metadata.get("homepage")
    if homepage:
        rv["homepage"] = homepage

    # _process_publication() returns None for publications without identifiers
    rv["publications"] = [
        processed
        for publication in record.get("publications") or []
        if (processed := _process_publication(publication))
    ]

    contacts = [
        {removeprefix(k, "contact_"): v for k, v in contact.items()}
        for contact in metadata.get("contacts") or []
        # make sure ORCID is available and valid
        if (orcid := contact.get("contact_orcid")) and ORCID_RE.match(orcid)
    ]
    for contact in contacts:
        name = contact.get("name")
        if name:
            # Drop the honorific so only the personal name remains
            contact["name"] = removeprefix(removeprefix(name, "Dr. "), "Dr ")
        if "orcid" in contact:
            contact["orcid"] = contact["orcid"].replace(" ", "")
    if contacts:
        # Only the first contact with a valid ORCID is kept
        rv["contact"] = contacts[0]

    for support_link in metadata.get("support_links") or []:
        link_type = support_link.get("type")
        link_url = support_link.get("url")
        if not link_url:
            continue
        if link_type == "Twitter":
            # Store the handle rather than the full profile URL
            rv["twitter"] = removeprefix(link_url, "https://twitter.com/")
        elif link_type == "Github":
            rv["repository"] = link_url

    # URLs already reported for this record, to avoid duplicate log messages
    missed: set[str] = set()
    for license_link in record.get("licence_links") or []:
        url = license_link.get("licence_url")
        if not url:
            continue
        license_standard = standardize_license(url)
        if license_standard != url:
            # If several links standardize, the last one wins
            rv["license"] = license_standard
        elif license_standard not in missed and license_standard not in SKIP_LICENSES:
            # The URL came back unchanged, i.e. it was not standardized
            missed.add(license_standard)
            logger.debug("Need to curate license URL: %s", license_standard)

    # Drop every entry whose key or value is empty
    return {k: v for k, v in rv.items() if k and v}
#: License URLs that are one-off and don't need curating. A URL in this set is
#: not logged by _process_record() when it cannot be standardized.
SKIP_LICENSES: set[str] = set()
def _process_publication(publication: dict[str, Any]) -> dict[str, Any] | None:
rv = {}
doi = publication.get("doi")
if doi:
doi = doi.rstrip(".").lower()
doi = removeprefix(doi, "doi:")
doi = removeprefix(doi, "https://doi.org/")
if "/" not in doi:
doi = None
else:
rv["doi"] = doi
pubmed = publication.get("pubmed_id")
if pubmed:
rv["pubmed"] = str(pubmed)
if not doi and not pubmed:
return None
title = publication.get("title")
if title:
title = title.replace(" ", " ").rstrip(".")
rv["title"] = title
year = publication.get("year")
if year:
rv["year"] = int(year)
return rv
class FairsharingAligner(Aligner):
    """Aligner for the FAIRsharing registry."""

    # The attributes below configure the Aligner base class; their exact
    # semantics are defined there.

    # Identifier of this external registry
    key = "fairsharing"
    # Function that returns the processed FAIRsharing records
    getter = get_fairsharing

    # Field of the processed record used as the alternate match key
    alt_key_match = "abbreviation"
    skip_deprecated = True

    # Fields of the processed record shown as columns during curation
    curation_header: ClassVar[Sequence[str]] = ("abbreviation", "name", "description")
if __name__ == "__main__":
    # Run the aligner's command line interface when executed as a script
    FairsharingAligner.cli()