# -*- coding: utf-8 -*-
"""Scraper for FAIRsharing.

.. seealso:: https://beta.fairsharing.org/API_doc
"""
import json
from typing import Any, Iterable, Mapping, MutableMapping, Optional
import pystow
import requests
from tqdm import tqdm
from bioregistry.constants import EXTERNAL
from bioregistry.utils import removeprefix, removesuffix
# Names exported by ``from <this module> import *``.
__all__ = ["get_fairsharing"]

# Folder holding FAIRsharing-derived files; it is created at import time
# (including missing parents) so later writes cannot fail on a missing folder.
DIRECTORY = EXTERNAL / "fairsharing"
DIRECTORY.mkdir(parents=True, exist_ok=True)

# JSON cache of the processed registry, read and written by get_fairsharing().
PROCESSED_PATH = DIRECTORY / "processed.json"
# Root of the FAIRsharing API; the endpoints below are built from it.
BASE_URL = "https://api.fairsharing.org"
# Endpoint that the client posts login credentials to in order to get a JWT.
SIGNIN_URL = BASE_URL + "/users/sign_in"
# Endpoint that the client starts from when listing records.
RECORDS_URL = BASE_URL + "/fairsharing_records"
# Values of a record's ``record_type`` that are kept during preprocessing.
# Only terminology artefacts are enabled; the other candidate types
# ("knowledgebase", "knowledgebase_and_repository", "repository") are
# currently not enabled.
ALLOWED_TYPES = {"terminology_artefact"}
def get_fairsharing(
    force_download: bool = False, use_tqdm: bool = False
) -> Mapping[str, Any]:
    """Get the FAIRsharing registry.

    The processed registry is cached as JSON at ``PROCESSED_PATH``. The cache
    is returned when it exists unless a fresh download is forced.

    :param force_download: If true, ignore an existing cache file and download
        all records again.
    :param use_tqdm: If true, show a progress bar while downloading.
    :returns: A dictionary mapping each record's prefix to the remaining
        fields of the preprocessed record.
    """
    if PROCESSED_PATH.exists() and not force_download:
        # The cache is written as UTF-8 below, so it must be read back as
        # UTF-8 regardless of the platform's default encoding.
        with PROCESSED_PATH.open(encoding="utf-8") as file:
            return json.load(file)

    client = FairsharingClient()
    # As of 2021-12-13, there are a bit less than 4k records that take about 3 minutes to download
    rv = {
        row.pop("prefix"): row
        for row in tqdm(
            client.iter_records(),
            unit_scale=True,
            unit="record",
            desc="Downloading FAIRsharing",
            disable=not use_tqdm,
        )
    }
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened with an explicit UTF-8 encoding to be portable.
    with PROCESSED_PATH.open("w", encoding="utf-8") as file:
        json.dump(rv, file, indent=2, ensure_ascii=False, sort_keys=True)
    return rv
# Whitelist of fields that survive record preprocessing; every other key of
# a record is dropped before it is returned.
KEEP = {
    "id",
    "prefix",
    "name",
    "abbreviation",
    "description",
    "subjects",
    "publications",
}
class FairsharingClient:
    """A client for programmatic access to the FAIRsharing private API."""

    # DOI prefix expected on every record; the part after it becomes the
    # record's ``prefix``.
    _DOI_PREFIX = "10.25504/"

    # Trailing words stripped from abbreviations. They are applied one after
    # another in this order, so more than one of them can be removed.
    _ABBREVIATION_SUFFIXES = (
        " CT",
        " CV",
        "Controlled Vocabulary",
        " Terminology",
        " Ontology",
        " Thesaurus",
        " Vocabulary",
        " Taxonomy",
    )

    def __init__(self, user: Optional[str] = None, password: Optional[str] = None):
        """Instantiate the client and get an appropriate JWT token.

        The credentials are looked up with :func:`pystow.get_config` (module
        ``fairsharing``, keys ``login`` and ``password``), with the explicit
        arguments given as the ``passthrough`` values.

        :param user: FAIRsharing username
        :param password: Corresponding FAIRsharing password
        """
        self.username = pystow.get_config(
            "fairsharing", "login", passthrough=user, raise_on_missing=True
        )
        self.password = pystow.get_config(
            "fairsharing", "password", passthrough=password, raise_on_missing=True
        )
        self.jwt = self.get_jwt()
        self.session = requests.Session()
        self.session.headers.update(
            {
                "Accept": "application/json",
                "Content-Type": "application/json",
                "Authorization": f"Bearer {self.jwt}",
            }
        )

    def get_jwt(self) -> str:
        """Get the JWT.

        :returns: The value of the ``jwt`` field of the sign-in response.
        :raises requests.HTTPError: If the sign-in request returns an HTTP
            error status.
        """
        payload = {
            "user": {
                "login": self.username,
                "password": self.password,
            },
        }
        res = requests.post(SIGNIN_URL, json=payload)
        # Fail with the HTTP status rather than a KeyError on "jwt" below.
        res.raise_for_status()
        return res.json()["jwt"]

    def iter_records(self) -> Iterable[Mapping[str, Any]]:
        """Iterate over all FAIRsharing records.

        :yields: Preprocessed records, starting from ``RECORDS_URL`` and
            following the pagination links.
        """
        yield from self._iter_records_helper(RECORDS_URL)

    def _preprocess_record(
        self, record: MutableMapping[str, Any]
    ) -> Optional[MutableMapping[str, Any]]:
        """Flatten, filter, and clean up a raw record.

        :param record: A raw entry of a response's ``data`` list. It must have
            the keys ``id`` and ``attributes``. It is not modified.
        :returns: A dictionary restricted to the keys in ``KEEP``, or None if
            the record should be skipped: its ``record_type`` is not in
            ``ALLOWED_TYPES``, it has no DOI, or its DOI does not start with
            ``10.25504/``.
        """
        # Work on a fresh, flat dictionary so the caller's record is untouched.
        record = {"id": record["id"], **record["attributes"]}
        if record.get("record_type") not in ALLOWED_TYPES:
            return None

        doi = record.get("doi")
        if doi is None:
            # these records are not possible to resolve
            return None
        if not doi.startswith(self._DOI_PREFIX):
            # Without the expected DOI there is no prefix to key the record
            # by, so report it and skip it instead of returning a record that
            # lacks the "prefix" key.
            tqdm.write(f"DOI has unexpected prefix: {doi}")
            return None
        record["prefix"] = record.pop("doi")[len(self._DOI_PREFIX) :]

        # A missing abbreviation is stored as None, like a missing name or
        # description, instead of raising a KeyError.
        abbreviation = record.get("abbreviation")
        if abbreviation is not None:
            for suffix in self._ABBREVIATION_SUFFIXES:
                abbreviation = removesuffix(abbreviation, suffix)
        record["abbreviation"] = abbreviation

        record["description"] = removeprefix(
            record.get("description"), "This FAIRsharing record describes: "
        )
        record["name"] = removeprefix(record.get("name"), "FAIRsharing record for: ")

        # Keep only publications that can be identified by DOI or PubMed ID.
        # ``or []`` covers an explicit null, and ``.get`` covers publications
        # that lack one of the three fields.
        record["publications"] = [
            {key: publication.get(key) for key in ("doi", "pubmed_id", "title")}
            for publication in record.get("publications") or []
            if publication.get("doi") or publication.get("pubmed_id")
        ]

        # Everything not listed in KEEP is dropped. The list below is retained
        # from the original code as a record of keys that get discarded:
        #     "created-at",
        #     "domains",  # maybe use later
        #     "legacy-ids",
        #     "fairsharing-licence",  # redundant across all records
        #     "licence-links",
        #     "taxonomies",
        #     "updated-at",
        #     "url-for-logo",
        #     "user-defined-tags",
        #     "countries",
        #     "fairsharing-registry",
        #     "record-type",
        #     "url",  # redundant of doi
        return {key: value for key, value in record.items() if key in KEEP}

    def _iter_records_helper(self, url: str) -> Iterable[Mapping[str, Any]]:
        """Yield preprocessed records from ``url`` and all following pages.

        :param url: URL of the first page to fetch.
        :yields: Every record for which :meth:`_preprocess_record` returns a
            non-empty dictionary.
        :raises requests.HTTPError: If a page request returns an HTTP error
            status.
        """
        # Follow the "next" links in a loop rather than by recursion, so the
        # number of pages is not bounded by the interpreter's recursion limit.
        while True:
            res = self.session.get(url)
            res.raise_for_status()
            page = res.json()
            for record in page["data"]:
                yv = self._preprocess_record(record)
                if yv:
                    yield yv
            url = page["links"].get("next")
            if not url:
                return
# Running this module as a script always bypasses the cache and downloads again.
if __name__ == "__main__":
    get_fairsharing(force_download=True)