# Source code for bioregistry.external.fairsharing

# -*- coding: utf-8 -*-

"""Scraper for FAIRsharing.

.. seealso:: https://beta.fairsharing.org/API_doc
"""

import json
from typing import Any, MutableMapping, Optional

from bioregistry.constants import EXTERNAL
from bioregistry.license_standardizer import standardize_license
from bioregistry.utils import removeprefix, removesuffix

__all__ = [
    "get_fairsharing",
]

# Folder for FAIRsharing data, nested under the shared EXTERNAL directory.
DIRECTORY = EXTERNAL / "fairsharing"
# NOTE: this runs at import time, so importing the module creates the folder
# (and any missing parents) if it does not exist yet.
DIRECTORY.mkdir(exist_ok=True, parents=True)
# JSON file that get_fairsharing() reads as a cache and rewrites after processing.
PROCESSED_PATH = DIRECTORY / "processed.json"


# Values of a record's "record_type" that _process_record() keeps; records with
# any other type are dropped. The commented-out types are not accepted.
ALLOWED_TYPES = {
    "terminology_artefact",
    "identifier_schema",
    # "knowledgebase",
    # "knowledgebase_and_repository",
    # "repository",
}


def get_fairsharing(force_download: bool = False, use_tqdm: bool = False):
    """Get the FAIRsharing registry.

    :param force_download: If true, ignore an existing processed cache file,
        reload the raw data and rewrite the cache. Also forwarded to
        ``fairsharing_client.load_fairsharing``.
    :param use_tqdm: Forwarded to ``fairsharing_client.load_fairsharing``.
    :returns: A dictionary of processed records, using the same keys as the
        data returned by ``load_fairsharing``. Records for which
        ``_process_record`` returns nothing are left out.
    """
    # The cache is written with ``ensure_ascii=False``, so it may contain raw
    # non-ASCII characters. Pin the encoding on both read and write instead of
    # relying on the platform's locale default.
    if PROCESSED_PATH.exists() and not force_download:
        with PROCESSED_PATH.open(encoding="utf-8") as file:
            return json.load(file)

    # Imported lazily: the client is only needed when the cache is not used.
    from fairsharing_client import load_fairsharing

    data = load_fairsharing(force_download=force_download, use_tqdm=use_tqdm)
    rv = {}
    for prefix, record in data.items():
        new_record = _process_record(record)
        if new_record:
            rv[prefix] = new_record

    with PROCESSED_PATH.open("w", encoding="utf-8") as file:
        json.dump(rv, file, indent=2, ensure_ascii=False, sort_keys=True)
    return rv
# Keys copied unchanged from a raw record into the processed record.
# A record of an allowed type that lacks one of them raises ``KeyError``.
KEEP = {
    "abbreviation",
    "description",
    "name",
    "subjects",
}

# Trailing words removed from abbreviations, tried in this order.
_ABBREVIATION_SUFFIXES = (
    " CT",
    " CV",
    " Controlled Vocabulary",
    " Terminology",
    " Ontology",
    " Thesaurus",
    " Vocabulary",
    " Taxonomy",
)

# Prefixes removed from the (already lower-cased) DOI, tried in this order.
_DOI_PREFIXES = (
    "doi:",
    "https://doi.org/",
    "http://doi.org/",
    "https://dx.doi.org/",
    "http://dx.doi.org/",
)


def _process_record(record: MutableMapping[str, Any]) -> Optional[MutableMapping[str, Any]]:
    """Reduce a raw FAIRsharing record to the fields kept in the processed file.

    :param record: A raw record. Its ``record_type`` decides whether it is kept;
        the ``metadata``, ``publications`` and ``licence_links`` entries are
        optional and may be missing or null.
    :returns: ``None`` if the record's type is not in ``ALLOWED_TYPES``, otherwise
        a new dictionary with the ``KEEP`` fields plus ``publications`` and,
        when available, ``homepage``, ``contact``, ``twitter``, ``repository``
        and ``license``.
    :raises KeyError: If a record of an allowed type lacks one of the ``KEEP`` keys.
    """
    if record.get("record_type") not in ALLOWED_TYPES:
        return None

    rv = {key: record[key] for key in KEEP}
    for suffix in _ABBREVIATION_SUFFIXES:
        rv["abbreviation"] = removesuffix(rv["abbreviation"], suffix)

    # ``or {}`` / ``or []`` below also cover keys that are present but null,
    # which a plain ``.get(key, default)`` would pass through as ``None``.
    metadata = record.get("metadata") or {}

    homepage = metadata.get("homepage")
    if homepage:
        rv["homepage"] = homepage

    # Publications without a usable DOI or PubMed identifier come back falsy
    # from ``_process_publication`` and are dropped here.
    rv["publications"] = [
        processed
        for processed in map(_process_publication, record.get("publications") or [])
        if processed
    ]

    contacts = [
        {removeprefix(key, "contact_"): value for key, value in contact.items()}
        for contact in metadata.get("contacts") or []
        if contact.get("contact_orcid")  # make sure ORCID is available
    ]
    for contact in contacts:
        name = contact.get("name")
        if name:
            contact["name"] = removeprefix(removeprefix(name, "Dr. "), "Dr ")
        orcid = contact.get("orcid")
        if orcid:
            contact["orcid"] = orcid.replace(" ", "")
    if contacts:
        # Only the first contact that has an ORCID is kept.
        rv["contact"] = contacts[0]

    for support_link in metadata.get("support_links") or []:
        link_type = support_link.get("type")
        link_url = support_link.get("url")
        if not link_url:
            continue
        if link_type == "Twitter":
            rv["twitter"] = removeprefix(link_url, "https://twitter.com/")
        elif link_type == "Github":
            rv["repository"] = link_url

    for license_link in record.get("licence_links") or []:
        url = license_link.get("licence_url")
        if not url:
            continue
        license_standard = standardize_license(url)
        # A result equal to the input URL is treated as "not standardized"
        # and skipped. If several links standardize, the last one wins.
        # TODO curate additional normalizations
        if license_standard != url:
            rv["license"] = license_standard

    return rv


def _process_publication(publication):
    """Extract the DOI, PubMed identifier and title of a publication entry.

    :param publication: A mapping that may have ``doi``, ``pubmed_id`` and
        ``title`` entries.
    :returns: A dictionary with ``doi``, ``pubmed`` and ``title`` keys where
        available, or ``None`` if there is neither a usable DOI nor a PubMed
        identifier.
    """
    rv = {}

    doi = publication.get("doi")
    if doi:
        doi = doi.strip().rstrip(".").lower()
        for prefix in _DOI_PREFIXES:
            doi = removeprefix(doi, prefix).lstrip()
        if "/" not in doi:
            # Without a slash this cannot be a prefix/suffix DOI; discard it.
            doi = None
        else:
            rv["doi"] = doi

    pubmed = publication.get("pubmed_id")
    if pubmed:
        rv["pubmed"] = str(pubmed)

    if not doi and not pubmed:
        return None

    title = publication.get("title")
    if title:
        # Collapse every run of whitespace (repeated spaces, non-breaking
        # spaces, newlines) to one space, then drop the trailing period.
        title = " ".join(title.split()).rstrip(".").rstrip()
        if title:
            rv["title"] = title

    # TODO add "year"
    return rv


if __name__ == "__main__":
    get_fairsharing(force_download=True)