Source code for bioregistry.external.fairsharing

"""Scraper for FAIRsharing.

.. seealso:: https://beta.fairsharing.org/API_doc
"""

from __future__ import annotations

import json
import logging
import re
from collections.abc import MutableMapping, Sequence
from pathlib import Path
from typing import Any, ClassVar

from bioregistry.constants import ORCID_PATTERN
from bioregistry.external.alignment_utils import Aligner, load_processed
from bioregistry.license_standardizer import standardize_license
from bioregistry.utils import removeprefix, removesuffix

# Public API of this module.
__all__ = [
    "FairsharingAligner",
    "get_fairsharing",
]

logger = logging.getLogger(__name__)

# Directory that contains this module file.
DIRECTORY = Path(__file__).parent.resolve()
# JSON file that get_fairsharing() writes its processed output to and, unless
# a refresh is forced, reloads from on later calls.
PROCESSED_PATH = DIRECTORY / "processed.json"


# Values of a record's "record_type" field that _process_record() accepts;
# records with any other type are dropped. The commented-out entries below are
# therefore not accepted.
ALLOWED_TYPES = {
    "terminology_artefact",
    "identifier_schema",
    # "knowledgebase",
    # "knowledgebase_and_repository",
    # "repository",
}

# Compiled form of ORCID_PATTERN, used by _process_record() to keep only
# contacts whose ORCID matches.
ORCID_RE = re.compile(ORCID_PATTERN)


def get_fairsharing(
    *, force_download: bool = False, force_reload: bool = False, use_tqdm: bool = True
) -> dict[str, dict[str, Any]]:
    """Get the FAIRsharing registry.

    :param force_download: Passed through to ``load_fairsharing``; also
        bypasses the processed cache file.
    :param force_reload: Bypass the processed cache file and re-process
        the records.
    :param use_tqdm: Passed through to ``load_fairsharing``.
    :returns: A mapping from FAIRsharing prefix to its processed record.
        Records that ``_process_record`` rejects or reduces to nothing
        are left out.
    """
    cache_is_usable = PROCESSED_PATH.exists() and not (force_download or force_reload)
    if cache_is_usable:
        return load_processed(PROCESSED_PATH)

    # Imported lazily so the client is only needed when the cache is bypassed
    from fairsharing_client import load_fairsharing

    raw_records = load_fairsharing(force_download=force_download, use_tqdm=use_tqdm)
    processed = {
        prefix: cleaned
        for prefix, raw in raw_records.items()
        if (cleaned := _process_record(raw))
    }

    # Persist the processed records so later calls can skip the download
    with PROCESSED_PATH.open("w") as cache_file:
        json.dump(processed, cache_file, indent=2, ensure_ascii=False, sort_keys=True)
    return processed


#: Top-level record fields that are copied as-is when they are non-empty
KEEP = {
    "description",
    "name",
    "subjects",
    "user_defined_tags",
    "domains",
}

#: Generic trailing words stripped from abbreviations. Order matters: longer,
#: more specific suffixes come before the shorter ones they contain.
_ABBREVIATION_SUFFIXES = (
    " CT",
    " CV",
    " Controlled Vocabulary",
    " Terminology",
    " Ontology",
    " Thesaurus",
    " Vocabulary",
    " Taxonomy",
)


def _clean_abbreviation(abbreviation: str) -> str:
    """Strip the first matching generic suffix from an abbreviation.

    At most one suffix is removed, and never the whole string.
    """
    for suffix in _ABBREVIATION_SUFFIXES:
        if abbreviation.endswith(suffix) and len(abbreviation) > len(suffix):
            return removesuffix(abbreviation, suffix)
    return abbreviation


def _process_record(record: MutableMapping[str, Any]) -> dict[str, Any] | None:
    """Reduce a raw FAIRsharing record to the fields kept in the processed file.

    :param record: A raw record as returned by the FAIRsharing client.
    :returns: ``None`` if the record's ``record_type`` is not in
        :data:`ALLOWED_TYPES`; otherwise a dictionary holding only the
        non-empty extracted fields (which may itself be empty).
    """
    if record.get("record_type") not in ALLOWED_TYPES:
        return None

    # .get() so a record lacking one of the fields is tolerated
    rv: dict[str, Any] = {key: record[key] for key in KEEP if record.get(key)}

    abbreviation = record.get("abbreviation")
    if abbreviation:
        rv["abbreviation"] = _clean_abbreviation(abbreviation)

    # ``or`` fallbacks also cover keys that are present but explicitly null
    metadata = record.get("metadata") or {}

    url_for_logo = record.get("url_for_logo")
    if url_for_logo:
        rv["logo"] = "https://api.fairsharing.org" + url_for_logo

    homepage = metadata.get("homepage")
    if homepage:
        rv["homepage"] = homepage

    rv["publications"] = [
        processed
        for publication in record.get("publications") or []
        if (processed := _process_publication(publication))
    ]

    contacts = [
        {removeprefix(k, "contact_"): v for k, v in contact.items()}
        for contact in metadata.get("contacts") or []
        # make sure ORCID is available and valid
        if (orcid := contact.get("contact_orcid")) and ORCID_RE.match(orcid)
    ]
    if contacts:
        # Only the first valid contact is kept, so only that one is normalized
        contact = contacts[0]
        name = contact.get("name")
        if name:
            contact["name"] = removeprefix(removeprefix(name, "Dr. "), "Dr ")
        if "orcid" in contact:
            contact["orcid"] = contact["orcid"].replace(" ", "")
        rv["contact"] = contact

    for support_link in metadata.get("support_links") or []:
        link_type = support_link.get("type")
        if link_type == "Twitter":
            rv["twitter"] = removeprefix(support_link["url"], "https://twitter.com/")
        elif link_type == "Github":
            rv["repository"] = support_link["url"]

    # License URLs already reported in this record, to avoid duplicate log lines
    missed: set[str] = set()
    for license_link in record.get("licence_links") or []:
        url = license_link.get("licence_url")
        if not url:
            continue
        license_standard = standardize_license(url)
        if license_standard == url:
            # The standardizer returned its input unchanged, i.e. it does not
            # know this URL yet
            if license_standard not in missed and license_standard not in SKIP_LICENSES:
                missed.add(license_standard)
                logger.debug("Need to curate license URL: %s", license_standard)
            continue
        rv["license"] = license_standard

    # Drop everything that ended up empty (e.g. an empty publication list)
    rv = {k: v for k, v in rv.items() if k and v}
    return rv


#: Licenses that are one-off and don't need curating
SKIP_LICENSES: set[str] = set()


def _process_publication(publication: dict[str, Any]) -> dict[str, Any] | None:
    """Extract the identifiers, title, and year from a raw publication record.

    :param publication: A raw publication dictionary from a FAIRsharing record.
    :returns: A dictionary with any of the keys ``doi``, ``pubmed``,
        ``title``, and ``year``, or ``None`` if the publication has neither
        a usable DOI nor a PubMed identifier.
    """
    rv: dict[str, Any] = {}

    doi = publication.get("doi")
    if doi:
        doi = doi.rstrip(".").lower()
        doi = removeprefix(doi, "doi:")
        doi = removeprefix(doi, "https://doi.org/")
        if "/" not in doi:
            # Every DOI has a prefix/suffix separator; anything else is junk
            doi = None
        else:
            rv["doi"] = doi

    pubmed = publication.get("pubmed_id")
    if pubmed:
        rv["pubmed"] = str(pubmed)

    if not doi and not pubmed:
        return None

    title = publication.get("title")
    if title:
        # Collapse runs of whitespace (including non-breaking spaces) into
        # single spaces, then drop any trailing period
        title = " ".join(title.split()).rstrip(".")
        if title:
            rv["title"] = title

    year = publication.get("year")
    if year:
        rv["year"] = int(year)
    return rv


class FairsharingAligner(Aligner):
    """Aligner for FAIRsharing."""

    key = "fairsharing"
    alt_key_match = "abbreviation"
    skip_deprecated = True
    getter = get_fairsharing
    curation_header: ClassVar[Sequence[str]] = ("abbreviation", "name", "description")


if __name__ == "__main__":
    FairsharingAligner.cli()