# Source code for bioregistry.external.re3data

"""Re3data is a registry of research data repositories.

Example API endpoint: https://www.re3data.org/api/v1/repository/r3d100010772
"""

import json
import logging
from collections.abc import Mapping, Sequence
from pathlib import Path
from typing import Any, ClassVar, Optional
from xml.etree import ElementTree

import requests
from tqdm.contrib.concurrent import thread_map

from bioregistry.external.alignment_utils import Aligner, load_processed
from bioregistry.utils import removeprefix

__all__ = [
    "Re3dataAligner",
    "get_re3data",
]

logger = logging.getLogger(__name__)
DIRECTORY = Path(__file__).parent.resolve()
PROCESSED_PATH = DIRECTORY / "processed.json"

BASE_URL = "https://www.re3data.org"
SCHEMA = "{http://www.re3data.org/schema/2-2}"


def get_re3data(force_download: bool = False) -> dict[str, dict[str, Any]]:
    """Get the re3data registry.

    This takes about 9 minutes since it has to look up each of the ~3K records with
    their own API call.

    :param force_download: If true, re-downloads the data even when a processed
        file already exists at ``PROCESSED_PATH``
    :returns: The re3data pre-processed data, keyed by re3data repository identifier
    :raises requests.HTTPError: If the repository listing endpoint responds with an
        error status
    """
    if PROCESSED_PATH.exists() and not force_download:
        return load_processed(PROCESSED_PATH)

    res = requests.get(f"{BASE_URL}/api/v1/repositories", timeout=15)
    # Fail with a clear HTTP error instead of handing an error page to the XML parser
    res.raise_for_status()
    tree = ElementTree.fromstring(res.text)

    # The listing carries the DOI of each repository; it is kept here so it can be
    # attached to the per-repository records after they are downloaded.
    identifier_to_doi: dict[str, Optional[str]] = {}
    for repository in tree.findall("repository"):
        identifier_element = repository.find("id")
        if identifier_element is None or identifier_element.text is None:
            continue
        doi_element = repository.find("doi")
        doi = (
            removeprefix(doi_element.text, "https://doi.org/")
            if doi_element is not None and doi_element.text
            else None
        )
        identifier_to_doi[identifier_element.text.strip()] = doi

    # One API call per repository identifier, run concurrently in threads
    records = dict(
        thread_map(  # type:ignore
            _get_record,
            identifier_to_doi,
            unit_scale=True,
            unit="record",
            desc="Getting re3data",
            disable=True,
        )
    )

    # backfill DOIs
    for identifier, record in records.items():
        doi = identifier_to_doi.get(identifier)
        if doi:
            record["doi"] = doi

    # ensure_ascii=False emits non-ASCII characters verbatim, so the encoding is
    # pinned to UTF-8 rather than left to the platform's locale default.
    with PROCESSED_PATH.open("w", encoding="utf-8") as file:
        json.dump(records, file, indent=2, sort_keys=True, ensure_ascii=False)

    return records
def _get_record(identifier: str) -> tuple[str, Mapping[str, Any]]:
    """Download and process the re3data record for a single repository.

    :param identifier: The re3data repository identifier, e.g., ``r3d100010772``
    :returns: A pair of the identifier and its processed record
    :raises requests.HTTPError: If the API responds with an error status
    """
    res = requests.get(f"{BASE_URL}/api/v1/repository/{identifier}", timeout=15)
    # Fail with a clear HTTP error instead of handing an error page to the XML parser
    res.raise_for_status()
    # The record of interest is the first child of the response's root element
    tree = ElementTree.fromstring(res.text)[0]
    return identifier, _process_record(identifier, tree)


def _process_record(identifier: str, tree_inner: ElementTree.Element) -> dict[str, Any]:
    """Extract the fields of interest from a repository's XML element.

    :param identifier: The re3data repository identifier, stored as ``prefix``
    :param tree_inner: The XML element holding the repository's metadata
    :returns: A dictionary in which empty values have been dropped and
        top-level string values have been stripped of surrounding whitespace
    """
    xrefs = (
        _clean_xref(element.text.strip())
        for element in tree_inner.findall(f"{SCHEMA}repositoryIdentifier")
        if element.text is not None
    )
    data = {
        "prefix": identifier,
        "name": tree_inner.findtext(f"{SCHEMA}repositoryName"),
        "description": tree_inner.findtext(f"{SCHEMA}description"),
        "homepage": tree_inner.findtext(f"{SCHEMA}repositoryURL"),
        "synonyms": [
            element.text.strip()
            for element in tree_inner.findall(f"{SCHEMA}additionalName")
            if element.text is not None
        ],
        # unparsable cross-references come back as None and are skipped
        "xrefs": dict(tup for tup in xrefs if tup),
    }

    license_element = tree_inner.find(f"{SCHEMA}databaseLicense/{SCHEMA}databaseLicenseName")
    if license_element is not None:
        data["license"] = license_element.text

    return {k: v.strip() if isinstance(v, str) else v for k, v in data.items() if v}


# Spellings of the FAIRsharing DOI prefix that are accepted. They differ in
# length, so the identifier has to be cut after the spelling that matched.
_FAIRSHARING_DOI_PREFIXES: tuple[str, ...] = (
    "FAIRsharing_DOI:10.25504/",
    "FAIRsharing_doi:10.25504/",
    "FAIRsharing_dOI:10.25504/",
    "FAIRSharing_doi:10.25504/",
    "FAIRsharing_doi;10.25504/",
    "FAIRsharing_doi: 10.25504/",
    "fairsharing_DOI:10.25504/",
    "fairsharing_doi:10.25504/",
    "FAIRsharin_doi:10.25504/",
    "FAIRsharing_doi.:10.25504/",
    "FAIRsharing_DOI: 10.25504/",
    "FAIRsharing_doi::10.25504/",
    "FAIRsharing_doi:10.24404/",
)

# Pairs of (literal prefix, normalized key), checked in order. Where one prefix
# is the beginning of another, the longer one must come first to be reachable.
_XREF_PREFIXES: tuple[tuple[str, str], ...] = (
    ("biodbcore-", "biodbcore"),
    ("MIR:", "miriam"),
    ("ROR:", "ror"),
    ("OMICS_", "omics"),
    ("Omics_", "omics"),
    ("omics_", "omics"),
    ("nif-", "nif"),
    ("ISNI:", "isni"),
    ("doi.org/", "doi"),
    ("doi:", "doi"),
    ("DOI: ", "doi"),
    ("DOI:", "doi"),
    ("RID:nlx_", "nlx"),
    ("PSSB-", "pssb"),
    ("OpenDOAR:", "opendoar"),
    ("openDOAR:", "opendoar"),
    ("ROAR:", "roar"),  # e.g., see http://roar.eprints.org/14208/
    ("hdl:", "hdl"),
    ("https://fairsharing.org/", "fairsharing.legacy"),
    ("http://fairsharing.org/", "fairsharing.legacy"),
    ("Wikidata:", "wikidata"),
    ("https://doi.org/10.5281/zenodo.", "zenodo"),
    ("https://doi.org/", "doi"),
)


def _clean_xref(xref: str) -> Optional[tuple[str, str]]:
    """Split a raw re3data repository identifier into a normalized key and identifier.

    :param xref: The text of a ``repositoryIdentifier`` element
    :returns: A pair of the normalized key and the local identifier, or ``None``
        if the cross-reference is not recognized
    """
    for fairsharing_prefix in _FAIRSHARING_DOI_PREFIXES:
        if xref.startswith(fairsharing_prefix):
            return "fairsharing", xref[len(fairsharing_prefix) :]

    for start, key in _XREF_PREFIXES:
        if xref.startswith(start):
            return key, xref[len(start) :]

    if xref.startswith("ISSN"):
        # Accept "ISSN" or "ISSN:" followed by any kind of whitespace.
        # NOTE(review): this is meant to also cover non-printing separator
        # characters after "ISSN " - confirm against the upstream data.
        remainder = xref[len("ISSN") :]
        if remainder.startswith(":"):
            remainder = remainder[1:]
        issn = remainder.strip()
        if remainder[:1].isspace() and issn:
            return "issn", issn

    if xref.startswith("RRID:"):
        inner_xref = xref[len("RRID:") :]
        # an underscore takes precedence over a dash as the delimiter
        for delimiter in ("_", "-"):
            if delimiter in inner_xref:
                prefix, identifier = inner_xref.split(delimiter, 1)
                return prefix.lower(), identifier
        logger.debug("unknown RRID: %s", xref)
        return None

    if "doi:" in xref:
        # the DOI may be one of several space-separated tokens
        for part in xref.split(" "):
            if part.startswith("doi:"):
                return "doi", part[len("doi:") :]

    logger.debug("re3data record had unparsable xref: %s", xref)
    return None


class Re3dataAligner(Aligner):
    """Aligner for the Registry of Research Data Repositories (re3data)."""

    key = "re3data"
    alt_key_match = "name"
    getter = get_re3data
    curation_header: ClassVar[Sequence[str]] = ("name", "homepage", "description")


if __name__ == "__main__":
    Re3dataAligner.cli()