
# -*- coding: utf-8 -*-

"""Re3data is a registry of research data repositories.

Example API endpoint: https://www.re3data.org/api/v1/repository/r3d100010772
"""

import json
import logging
from typing import Any, Mapping, Optional, Tuple
from xml.etree import ElementTree

import requests
from tqdm.contrib.concurrent import thread_map
from tqdm.contrib.logging import logging_redirect_tqdm

from bioregistry.constants import EXTERNAL
from bioregistry.utils import removeprefix

__all__ = [
    "get_re3data",
]

logger = logging.getLogger(__name__)
DIRECTORY = EXTERNAL / "re3data"
DIRECTORY.mkdir(exist_ok=True, parents=True)
PROCESSED_PATH = DIRECTORY / "processed.json"

BASE_URL = "https://www.re3data.org"
SCHEMA = "{http://www.re3data.org/schema/2-2}"
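
# Rough shape of the two XML payloads this module consumes (the element names
# are those used by the parsing code below; anything beyond them is an
# assumption about the re3data API rather than something this module relies on):
#
# - GET {BASE_URL}/api/v1/repositories: a root element whose <repository>
#   children each carry an <id> (the re3data identifier, e.g., r3d100010772)
#   and optionally a <doi>, possibly given as a full https://doi.org/... URL.
#
# - GET {BASE_URL}/api/v1/repository/{id}: a root element wrapping a single
#   repository record in the re3data 2.2 namespace (SCHEMA above), with
#   elements such as repositoryName, description, repositoryURL,
#   additionalName, repositoryIdentifier, and
#   databaseLicense/databaseLicenseName.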


def get_re3data(force_download: bool = False):
    """Get the re3data registry.

    This takes about 9 minutes since it has to look up each of the ~3K
    records with its own API call.

    :param force_download: If true, re-downloads the data
    :returns: The re3data pre-processed data
    """
    if PROCESSED_PATH.exists() and not force_download:
        with PROCESSED_PATH.open() as file:
            return json.load(file)

    res = requests.get(f"{BASE_URL}/api/v1/repositories")
    tree = ElementTree.fromstring(res.text)

    # Map each re3data identifier to its DOI (if one is annotated)
    identifier_to_doi = {}
    for repository in tree.findall("repository"):
        identifier_element = repository.find("id")
        if identifier_element is None or identifier_element.text is None:
            continue
        doi_element = repository.find("doi")
        # explicit ``is not None`` check, since an Element with no children is falsy
        doi = (
            removeprefix(doi_element.text, "https://doi.org/")
            if doi_element is not None and doi_element.text
            else None
        )
        identifier_to_doi[identifier_element.text.strip()] = doi

    # Look up each record with its own API call, in parallel
    records = dict(
        thread_map(
            _get_record,
            identifier_to_doi,
            unit_scale=True,
            unit="record",
            desc="Getting re3data",
            disable=True,
        )
    )

    # Backfill DOIs from the list endpoint
    for identifier, record in records.items():
        doi = identifier_to_doi.get(identifier)
        if doi:
            record["doi"] = doi

    with PROCESSED_PATH.open("w") as file:
        json.dump(records, file, indent=2, sort_keys=True, ensure_ascii=False)
    return records


def _get_record(identifier: str) -> Tuple[str, Mapping[str, Any]]:
    res = requests.get(f"{BASE_URL}/api/v1/repository/{identifier}")
    # The response wraps the single repository record inside the root element
    tree = ElementTree.fromstring(res.text)[0]
    return identifier, _process_record(identifier, tree)


def _process_record(identifier: str, tree_inner):
    xrefs = (
        _clean_xref(element.text.strip())
        for element in tree_inner.findall(f"{SCHEMA}repositoryIdentifier")
    )
    data = {
        "prefix": identifier,
        "name": tree_inner.find(f"{SCHEMA}repositoryName").text,
        "description": tree_inner.find(f"{SCHEMA}description").text,
        "homepage": tree_inner.find(f"{SCHEMA}repositoryURL").text,
        "synonyms": [
            element.text.strip() for element in tree_inner.findall(f"{SCHEMA}additionalName")
        ],
        "xrefs": dict(tup for tup in xrefs if tup),
    }
    license_element = tree_inner.find(f"{SCHEMA}databaseLicense/{SCHEMA}databaseLicenseName")
    if license_element is not None:
        data["license"] = license_element.text
    # Strip string values and drop empty entries
    return {k: v.strip() if isinstance(v, str) else v for k, v in data.items() if v}


def _clean_xref(xref: str) -> Optional[Tuple[str, str]]:
    # The FAIRsharing DOI prefix appears in the source data with many
    # capitalization and punctuation variants, so match them all and take
    # everything after the first slash (i.e., after the DOI prefix) rather
    # than slicing a fixed number of characters.
    if xref.startswith(
        (
            "FAIRsharing_DOI:10.25504/",
            "FAIRsharing_doi:10.25504/",
            "FAIRsharing_dOI:10.25504/",
            "FAIRSharing_doi:10.25504/",
            "FAIRsharing_doi;10.25504/",
            "FAIRsharing_doi: 10.25504/",
            "fairsharing_DOI:10.25504/",
            "fairsharing_doi:10.25504/",
            "FAIRsharin_doi:10.25504/",
            "FAIRsharing_doi.:10.25504/",
            "FAIRsharing_DOI: 10.25504/",
            "FAIRsharing_doi::10.25504/",
            "FAIRsharing_doi:10.24404/",
        )
    ):
        return "fairsharing", xref.split("/", 1)[1]
    for start, key in [
        ("biodbcore-", "biodbcore"),
        ("MIR:", "miriam"),
        ("ROR:", "ror"),
        ("OMICS_", "omics"),
        ("Omics_", "omics"),
        ("omics_", "omics"),
        ("ISSN ", "issn"),
        ("ISSN: ", "issn"),
        ("nif-", "nif"),
        ("ISNI:", "isni"),
        ("doi.org/", "doi"),
        ("doi:", "doi"),
        ("DOI:", "doi"),
        ("DOI: ", "doi"),
        ("RID:nlx_", "nlx"),
        ("PSSB-", "pssb"),
        ("OpenDOAR:", "opendoar"),
        ("openDOAR:", "opendoar"),
        ("ROAR:", "roar"),  # e.g., see http://roar.eprints.org/14208/
        ("hdl:", "hdl"),
        ("https://fairsharing.org/", "fairsharing.legacy"),
        ("http://fairsharing.org/", "fairsharing.legacy"),
        ("Wikidata:", "wikidata"),
        ("https://doi.org/10.5281/zenodo.", "zenodo"),
        ("https://doi.org/", "doi"),
    ]:
        if xref.startswith(start):
            return key, xref[len(start) :]
    if xref.startswith("RRID:"):
        # RRIDs look like RRID:<prefix>_<identifier> or RRID:<prefix>-<identifier>
        inner_xref = xref[len("RRID:") :]
        if "_" in inner_xref:
            prefix, identifier = inner_xref.split("_", 1)
            return prefix.lower(), identifier
        elif "-" in inner_xref:
            prefix, identifier = inner_xref.split("-", 1)
            return prefix.lower(), identifier
        else:
            logger.debug("unknown RRID: %s", xref)
        return None
    if "doi:" in xref:
        for part in xref.split(" "):
            if part.startswith("doi"):
                return "doi", part[len("doi:") :]
    logger.debug("re3data record had unparsable xref: %s", xref)
    return None


if __name__ == "__main__":
    with logging_redirect_tqdm():
        get_re3data(force_download=True)
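

# Illustrative usage (a sketch; the keys present vary per record and the
# values below are placeholders, not real re3data content):
#
#     from bioregistry.external.re3data import get_re3data
#
#     records = get_re3data()  # served from processed.json if already cached
#     record = records["r3d100010772"]
#     # a processed record may include: prefix, name, description, homepage,
#     # synonyms, xrefs (e.g., keys like "fairsharing", "miriam", "ror"),
#     # license, and doi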