Source code for bioregistry.external.fairsharing

# -*- coding: utf-8 -*-

"""Scraper for FAIRsharing.

.. seealso:: https://beta.fairsharing.org/API_doc
"""

import json
from typing import Any, Iterable, Mapping, MutableMapping, Optional

import pystow
import requests
from tqdm import tqdm

from bioregistry.constants import EXTERNAL
from bioregistry.utils import removeprefix, removesuffix

# Public API of this module: only the top-level getter is exported.
__all__ = [
    "get_fairsharing",
]

# Local folder for FAIRsharing data; NOTE: it is created as a side effect of importing this module.
DIRECTORY = EXTERNAL / "fairsharing"
DIRECTORY.mkdir(exist_ok=True, parents=True)
# JSON cache that get_fairsharing() reads from and writes to.
PROCESSED_PATH = DIRECTORY / "processed.json"

# Root of the FAIRsharing API and the two endpoints used by FairsharingClient:
# SIGNIN_URL is POSTed to for a JWT, RECORDS_URL is the first page of the record listing.
BASE_URL = "https://api.fairsharing.org"
SIGNIN_URL = f"{BASE_URL}/users/sign_in"
RECORDS_URL = f"{BASE_URL}/fairsharing_records"


# Values of a record's "record_type" attribute that are kept during preprocessing;
# records with any other type are discarded. The commented-out entries are
# presumably other FAIRsharing record types that are currently disabled.
ALLOWED_TYPES = {
    "terminology_artefact",
    # "knowledgebase",
    # "knowledgebase_and_repository",
    # "repository",
}


def get_fairsharing(force_download: bool = False, use_tqdm: bool = False):
    """Get the FAIRsharing registry.

    If the processed JSON cache exists and ``force_download`` is false, the cache
    is loaded and returned. Otherwise all records are downloaded through a
    :class:`FairsharingClient`, indexed by their ``prefix``, written to the
    cache, and returned.

    :param force_download: If true, ignore an existing cache file and download again
    :param use_tqdm: If true, show a progress bar while downloading
    :returns: A dictionary from FAIRsharing prefix to the remaining record fields
    """
    # The cache is written with ``ensure_ascii=False``, so it contains raw non-ASCII
    # characters. Pin UTF-8 explicitly for both reading and writing instead of relying
    # on the platform default encoding, which is not UTF-8 everywhere.
    if PROCESSED_PATH.exists() and not force_download:
        with PROCESSED_PATH.open(encoding="utf-8") as file:
            return json.load(file)

    client = FairsharingClient()
    # As of 2021-12-13, there are a bit less than 4k records that take about 3 minutes to download
    rv = {
        # the prefix becomes the key and is removed from the record itself
        row.pop("prefix"): row
        for row in tqdm(
            client.iter_records(),
            unit_scale=True,
            unit="record",
            desc="Downloading FAIRsharing",
            disable=not use_tqdm,
        )
    }
    with PROCESSED_PATH.open("w", encoding="utf-8") as file:
        json.dump(rv, file, indent=2, ensure_ascii=False, sort_keys=True)
    return rv
# Keys of a preprocessed record that are retained in the output; everything else
# returned by the API (timestamps, licences, logos, countries, ...) is dropped.
KEEP = {
    "abbreviation",
    "description",
    "id",
    "name",
    "prefix",
    "subjects",
    "publications",
}

# DOI prefix that a record's DOI must carry for a FAIRsharing prefix to be derived from it.
_DOI_PREFIX = "10.25504/"

# Trailing words stripped from abbreviations, applied in this order.
_ABBREVIATION_SUFFIXES = (
    " CT",
    " CV",
    "Controlled Vocabulary",
    " Terminology",
    " Ontology",
    " Thesaurus",
    " Vocabulary",
    " Taxonomy",
)

# Publication fields copied into the output.
_PUBLICATION_KEYS = ("doi", "pubmed_id", "title")


class FairsharingClient:
    """A client for programmatic access to the FAIRsharing private API."""

    def __init__(self, user: Optional[str] = None, password: Optional[str] = None):
        """Instantiate the client and get an appropriate JWT token.

        Credentials that are not passed explicitly are looked up through
        :func:`pystow.get_config` under the ``fairsharing`` module with the keys
        ``login`` and ``password``; the lookup is configured to raise if missing.

        :param user: FAIRsharing username
        :param password: Corresponding FAIRsharing password
        """
        self.username = pystow.get_config(
            "fairsharing", "login", passthrough=user, raise_on_missing=True
        )
        self.password = pystow.get_config(
            "fairsharing", "password", passthrough=password, raise_on_missing=True
        )
        self.jwt = self.get_jwt()
        # All record requests go through one session carrying the bearer token.
        self.session = requests.Session()
        self.session.headers.update(
            {
                "Accept": "application/json",
                "Content-Type": "application/json",
                "Authorization": f"Bearer {self.jwt}",
            }
        )

    def get_jwt(self) -> str:
        """Get the JWT.

        :returns: The ``jwt`` field of the sign-in response
        :raises requests.HTTPError: If the sign-in request returns an error status
        """
        payload = {
            "user": {
                "login": self.username,
                "password": self.password,
            },
        }
        res = requests.post(SIGNIN_URL, json=payload)
        # Surface a rejected login as an HTTP error instead of a KeyError on "jwt".
        res.raise_for_status()
        return res.json()["jwt"]

    def iter_records(self) -> Iterable[Mapping[str, Any]]:
        """Iterate over all FAIRsharing records."""
        yield from self._iter_records_helper(RECORDS_URL)

    def _preprocess_record(
        self, record: MutableMapping[str, Any]
    ) -> Optional[MutableMapping[str, Any]]:
        """Flatten and clean a raw API record.

        :param record: A raw record with an ``id`` and an ``attributes`` mapping
        :returns: The cleaned record restricted to the keys in ``KEEP``, or ``None``
            if the record has a disallowed type, no DOI, or a DOI with an
            unexpected prefix
        """
        # Build a fresh flat dictionary; the caller's record is left untouched.
        record = {"id": record["id"], **record["attributes"]}
        if record.get("record_type") not in ALLOWED_TYPES:
            return None

        doi = record.get("doi")
        if doi is None:
            # these records are not possible to resolve
            return None
        if not doi.startswith(_DOI_PREFIX):
            tqdm.write(f"DOI has unexpected prefix: {doi}")
            # Without a prefix the record cannot be indexed by the caller, so skip it
            # rather than returning a record that lacks the "prefix" key.
            return None
        record["prefix"] = doi[len(_DOI_PREFIX) :]

        for suf in _ABBREVIATION_SUFFIXES:
            record["abbreviation"] = removesuffix(record["abbreviation"], suf)

        record["description"] = removeprefix(
            record.get("description"), "This FAIRsharing record describes: "
        )
        record["name"] = removeprefix(record.get("name"), "FAIRsharing record for: ")
        # Keep only publications that can be identified by DOI or PubMed identifier.
        # Fields are read with .get() so a publication missing an optional field
        # (e.g. no "pubmed_id" key) does not abort the whole download.
        record["publications"] = [
            {k: publication.get(k) for k in _PUBLICATION_KEYS}
            for publication in record.get("publications", [])
            if publication.get("doi") or publication.get("pubmed_id")
        ]
        return {key: value for key, value in record.items() if key in KEEP}

    def _iter_records_helper(self, url: str) -> Iterable[Mapping[str, Any]]:
        """Yield preprocessed records from ``url`` and every page linked after it.

        Pages are followed through the ``next`` entry of each response's ``links``
        in a loop, so the number of pages is not bounded by the recursion limit.

        :param url: The URL of the first page to fetch
        :raises requests.HTTPError: If a page request returns an error status
        """
        while True:
            res = self.session.get(url)
            res.raise_for_status()
            page = res.json()
            for record in page["data"]:
                yv = self._preprocess_record(record)
                if yv:
                    yield yv
            url = page["links"].get("next")
            if not url:
                return


if __name__ == "__main__":
    get_fairsharing(force_download=True)