Source code for bioregistry.external.re3data

"""Re3data is a registry of research data repositories.

Example API endpoint: https://www.re3data.org/api/v1/repository/r3d100010772
"""

import json
import logging
from collections.abc import Mapping, Sequence
from pathlib import Path
from typing import Any, ClassVar
from xml.etree import ElementTree

import requests
from tqdm.contrib.concurrent import thread_map

from bioregistry.external.alignment_utils import Aligner, load_processed
from bioregistry.utils import removeprefix

# Names exported by ``from ... import *``
__all__ = [
    "Re3dataAligner",
    "get_re3data",
]

logger = logging.getLogger(__name__)
# Directory containing this module; the processed file below lives beside it
DIRECTORY = Path(__file__).parent.resolve()
# JSON file that get_re3data() loads when present and rewrites after a download
PROCESSED_PATH = DIRECTORY / "processed.json"

# Root URL that the API endpoint paths are appended to
BASE_URL = "https://www.re3data.org"
# Brace-wrapped XML namespace, prepended to tag names when searching a record's XML
SCHEMA = "{http://www.re3data.org/schema/2-2}"


def get_re3data(force_download: bool = False) -> dict[str, dict[str, Any]]:
    """Get the re3data registry.

    This takes about 9 minutes since it has to look up each of the ~3K records with
    their own API call.

    :param force_download: If true, re-downloads the data
    :returns: The re3data pre-processed data
    :raises requests.HTTPError: If the repository listing request returns an
        error status
    """
    if PROCESSED_PATH.exists() and not force_download:
        return load_processed(PROCESSED_PATH)

    res = requests.get(f"{BASE_URL}/api/v1/repositories", timeout=15)
    # Fail before parsing so that an error response can never be turned into
    # an empty registry that overwrites the existing processed file.
    res.raise_for_status()
    tree = ElementTree.fromstring(res.text)

    # The listing carries the DOI for each repository, so remember it here
    # and copy it onto the detailed records once they have been fetched.
    identifier_to_doi: dict[str, str | None] = {}
    for repository in tree.findall("repository"):
        identifier_element = repository.find("id")
        if identifier_element is None or identifier_element.text is None:
            continue
        doi_element = repository.find("doi")
        doi = (
            removeprefix(doi_element.text, "https://doi.org/")
            if doi_element is not None and doi_element.text
            else None
        )
        identifier_to_doi[identifier_element.text.strip()] = doi

    # Iterating the dict yields its keys, so each identifier gets one lookup.
    records = dict(
        thread_map(  # type:ignore
            _get_record,
            identifier_to_doi,
            unit_scale=True,
            unit="record",
            desc="Getting re3data",
            disable=True,
        )
    )

    # backfill DOIs
    for identifier, record in records.items():
        doi = identifier_to_doi.get(identifier)
        if doi:
            record["doi"] = doi

    # ensure_ascii=False emits raw non-ASCII characters, so the encoding must
    # be pinned rather than left to the platform default.
    with PROCESSED_PATH.open("w", encoding="utf-8") as file:
        json.dump(records, file, indent=2, sort_keys=True, ensure_ascii=False)

    return records
def _get_record(identifier: str) -> tuple[str, Mapping[str, Any]]:
    """Download and process the record for a single re3data identifier.

    :param identifier: The re3data repository identifier, used in the API path
    :returns: A pair of the identifier and its processed record
    """
    res = requests.get(f"{BASE_URL}/api/v1/repository/{identifier}", timeout=15)
    # The first child of the response's root element is what gets processed
    tree = ElementTree.fromstring(res.text)[0]
    return identifier, _process_record(identifier, tree)


def _process_record(identifier: str, tree_inner: ElementTree.Element) -> dict[str, Any]:
    """Convert a repository XML element into a flat dictionary.

    :param identifier: The re3data identifier, stored under the ``prefix`` key
    :param tree_inner: The XML element whose children are searched for fields
    :returns: A dictionary of the extracted fields. Entries whose value is
        falsy (missing text, empty lists, empty dicts) are omitted, and
        string values are stripped of surrounding whitespace.
    """
    xrefs = (
        _clean_xref(element.text.strip())
        for element in tree_inner.findall(f"{SCHEMA}repositoryIdentifier")
        if element.text is not None
    )
    data = {
        "prefix": identifier,
        "name": tree_inner.findtext(f"{SCHEMA}repositoryName"),
        "description": tree_inner.findtext(f"{SCHEMA}description"),
        "homepage": tree_inner.findtext(f"{SCHEMA}repositoryURL"),
        "synonyms": [
            element.text.strip()
            for element in tree_inner.findall(f"{SCHEMA}additionalName")
            if element.text is not None
        ],
        # _clean_xref returns None for unparsable values; those are skipped
        "xrefs": dict(tup for tup in xrefs if tup),
    }

    license_element = tree_inner.find(f"{SCHEMA}databaseLicense/{SCHEMA}databaseLicenseName")
    if license_element is not None:
        data["license"] = license_element.text

    return {k: v.strip() if isinstance(v, str) else v for k, v in data.items() if v}


# Spellings of the FAIRsharing DOI prefix that _clean_xref accepts, including
# ones with inconsistent case, punctuation, and typos. They differ in length,
# so the matched spelling itself must be used when slicing.
_FAIRSHARING_DOI_PREFIXES: tuple[str, ...] = (
    "FAIRsharing_DOI:10.25504/",
    "FAIRsharing_doi:10.25504/",
    "FAIRsharing_dOI:10.25504/",
    "FAIRSharing_doi:10.25504/",
    "FAIRsharing_doi;10.25504/",
    "FAIRsharing_doi: 10.25504/",
    "fairsharing_DOI:10.25504/",
    "fairsharing_doi:10.25504/",
    "FAIRsharin_doi:10.25504/",
    "FAIRsharing_doi.:10.25504/",
    "FAIRsharing_DOI: 10.25504/",
    "FAIRsharing_doi::10.25504/",
    "FAIRsharing_doi:10.24404/",
)

# Ordered (literal prefix, key) pairs. Order matters: when one prefix is
# itself a prefix of another, the longer one must come first or it can never
# match (e.g. "DOI: " before "DOI:", the zenodo DOI before the generic DOI).
_XREF_PREFIXES: tuple[tuple[str, str], ...] = (
    ("biodbcore-", "biodbcore"),
    ("MIR:", "miriam"),
    ("ROR:", "ror"),
    ("OMICS_", "omics"),
    ("Omics_", "omics"),
    ("omics_", "omics"),
    ("ISSN ", "issn"),
    ("ISSN: ", "issn"),
    ("nif-", "nif"),
    ("ISNI:", "isni"),
    ("doi.org/", "doi"),
    ("doi:", "doi"),
    ("DOI: ", "doi"),
    ("DOI:", "doi"),
    ("RID:nlx_", "nlx"),
    ("PSSB-", "pssb"),
    ("OpenDOAR:", "opendoar"),
    ("openDOAR:", "opendoar"),
    ("ROAR:", "roar"),  # e.g., see http://roar.eprints.org/14208/
    ("hdl:", "hdl"),
    ("https://fairsharing.org/", "fairsharing.legacy"),
    ("http://fairsharing.org/", "fairsharing.legacy"),
    ("Wikidata:", "wikidata"),
    ("https://doi.org/10.5281/zenodo.", "zenodo"),
    ("https://doi.org/", "doi"),
)


def _clean_xref(xref: str) -> tuple[str, str] | None:
    """Split a raw repository identifier string into a (key, identifier) pair.

    :param xref: The text of a ``repositoryIdentifier`` element, already stripped
    :returns: A pair of the key and the remaining identifier, or None if the
        string does not match any known pattern
    """
    for start in _FAIRSHARING_DOI_PREFIXES:
        if xref.startswith(start):
            # Slice by the spelling that matched, not by a fixed length
            return "fairsharing", xref[len(start) :]

    for start, key in _XREF_PREFIXES:
        if xref.startswith(start):
            return key, xref[len(start) :]

    if xref.startswith("RRID:"):
        inner_xref = xref[len("RRID:") :]
        # The part before the first delimiter becomes the key; an underscore
        # takes priority over a hyphen when both are present.
        for delimiter in ("_", "-"):
            if delimiter in inner_xref:
                prefix, identifier = inner_xref.split(delimiter, 1)
                return prefix.lower(), identifier
        logger.debug("unknown RRID: %s", xref)
        return None

    if "doi:" in xref:
        # The DOI may be embedded among other space-separated tokens
        for part in xref.split(" "):
            if part.startswith("doi:"):
                return "doi", part[len("doi:") :]

    logger.debug("re3data record had unparsable xref: %s", xref)
    return None


class Re3dataAligner(Aligner):
    """Aligner for the Registry of Research Data Repositories (re3data)."""

    key = "re3data"
    alt_key_match = "name"
    getter = get_re3data
    curation_header: ClassVar[Sequence[str]] = ("name", "homepage", "description")


if __name__ == "__main__":
    Re3dataAligner.cli()