# Source code for bioregistry.external.re3data

"""Re3data is a registry of research data repositories.

Example API endpoint: https://www.re3data.org/api/v1/repository/r3d100010772
"""

import json
import logging
from collections.abc import Mapping, Sequence
from pathlib import Path
from typing import Any, ClassVar
from xml.etree import ElementTree

import requests
from tqdm.contrib.concurrent import thread_map

from bioregistry.external.alignment_utils import Aligner, load_processed
from bioregistry.utils import removeprefix

# Public names exported by a star-import of this module.
__all__ = [
    "Re3dataAligner",
    "get_re3data",
]

logger = logging.getLogger(__name__)
# Resolved directory that contains this module.
DIRECTORY = Path(__file__).parent.resolve()
# JSON cache that ``get_re3data`` reads when it exists and rewrites after a download.
PROCESSED_PATH = DIRECTORY / "processed.json"

# Root of the re3data website; API URLs are built as ``{BASE_URL}/api/v1/...``.
BASE_URL = "https://www.re3data.org"
# XML namespace, in ElementTree's ``{uri}`` form, that is prefixed to element
# names when searching inside a single repository record.
SCHEMA = "{http://www.re3data.org/schema/2-2}"



# [docs]  (link marker left over from the rendered documentation page)
def get_re3data(force_download: bool = False) -> dict[str, dict[str, Any]]:
    """Get the re3data registry.

    This takes about 9 minutes since it has to look up each of the ~3K records with
    their own API call.

    :param force_download: If true, re-downloads the data even if a processed copy
        already exists at ``PROCESSED_PATH``

    :returns: The re3data pre-processed data, keyed by re3data repository identifier

    :raises requests.HTTPError: If the request for the repository listing comes back
        with an HTTP error status
    """
    if PROCESSED_PATH.exists() and not force_download:
        return load_processed(PROCESSED_PATH)

    res = requests.get(f"{BASE_URL}/api/v1/repositories", timeout=15)
    # Fail loudly on an HTTP error rather than trying to parse an error response,
    # which could otherwise end with an empty result being written to the cache.
    res.raise_for_status()
    tree = ElementTree.fromstring(res.text)

    # The listing gives each repository's identifier and, optionally, its DOI.
    identifier_to_doi = {}
    for repository in tree.findall("repository"):
        identifier_element = repository.find("id")
        if identifier_element is None or identifier_element.text is None:
            continue

        doi_element = repository.find("doi")
        doi = (
            removeprefix(doi_element.text, "https://doi.org/")
            if doi_element is not None and doi_element.text
            else None
        )
        identifier_to_doi[identifier_element.text.strip()] = doi

    # Iterating the dictionary yields its keys, so one record is fetched per identifier.
    records = dict(
        thread_map(  # type:ignore
            _get_record,
            identifier_to_doi,
            unit_scale=True,
            unit="record",
            desc="Getting re3data",
            disable=True,
        )
    )

    # backfill DOIs
    for identifier, record in records.items():
        doi = identifier_to_doi.get(identifier)
        if doi:
            record["doi"] = doi

    # ``ensure_ascii=False`` emits non-ASCII characters verbatim, so the file must be
    # opened as UTF-8 explicitly instead of relying on the platform's default encoding.
    with PROCESSED_PATH.open("w", encoding="utf-8") as file:
        json.dump(records, file, indent=2, sort_keys=True, ensure_ascii=False)

    return records



def _get_record(identifier: str) -> tuple[str, Mapping[str, Any]]:
    """Download and process the record for a single re3data repository.

    :param identifier: The re3data repository identifier, e.g., ``r3d100010772``

    :returns: A pair of the identifier and its processed record
    """
    url = f"{BASE_URL}/api/v1/repository/{identifier}"
    response = requests.get(url, timeout=15)
    document_root = ElementTree.fromstring(response.text)
    # The first child of the document root is what gets processed.
    repository_element = document_root[0]
    processed = _process_record(identifier, repository_element)
    return identifier, processed


def _process_record(identifier: str, tree_inner: ElementTree.Element) -> dict[str, Any]:
    """Convert the XML element for one repository into a flat record.

    :param identifier: The re3data repository identifier, stored under the ``prefix`` key
    :param tree_inner: The repository element whose children are looked up by their
        ``SCHEMA``-qualified names

    :returns: A dictionary with any of the keys ``prefix``, ``name``, ``description``,
        ``homepage``, ``synonyms``, ``xrefs``, and ``license``. String values are
        stripped of surrounding whitespace, and keys whose value is missing or empty
        after stripping are left out.
    """
    xrefs = (
        _clean_xref(element.text.strip())
        for element in tree_inner.findall(f"{SCHEMA}repositoryIdentifier")
        if element.text is not None
    )
    synonyms = (
        element.text.strip()
        for element in tree_inner.findall(f"{SCHEMA}additionalName")
        if element.text is not None
    )
    data: dict[str, Any] = {
        "prefix": identifier,
        "name": tree_inner.findtext(f"{SCHEMA}repositoryName"),
        "description": tree_inner.findtext(f"{SCHEMA}description"),
        "homepage": tree_inner.findtext(f"{SCHEMA}repositoryURL"),
        # names that consisted only of whitespace are dropped
        "synonyms": [synonym for synonym in synonyms if synonym],
        # xrefs that could not be parsed come back as None and are skipped
        "xrefs": dict(tup for tup in xrefs if tup),
        "license": tree_inner.findtext(f"{SCHEMA}databaseLicense/{SCHEMA}databaseLicenseName"),
    }

    # Strip *before* testing for emptiness, so that a whitespace-only value is
    # omitted instead of being emitted as an empty string.
    record: dict[str, Any] = {}
    for key, value in data.items():
        if isinstance(value, str):
            value = value.strip()
        if value:
            record[key] = value
    return record


def _clean_xref(xref: str) -> tuple[str, str] | None:
    if (
        xref.startswith("FAIRsharing_DOI:10.25504/")
        or xref.startswith("FAIRsharing_doi:10.25504/")
        or xref.startswith("FAIRsharing_dOI:10.25504/")
        or xref.startswith("FAIRSharing_doi:10.25504/")
        or xref.startswith("FAIRsharing_doi;10.25504/")
        or xref.startswith("FAIRsharing_doi: 10.25504/")
        or xref.startswith("fairsharing_DOI:10.25504/")
        or xref.startswith("fairsharing_doi:10.25504/")
        or xref.startswith("FAIRsharin_doi:10.25504/")
        or xref.startswith("FAIRsharing_doi.:10.25504/")
        or xref.startswith("FAIRsharing_DOI: 10.25504/")
        or xref.startswith("FAIRsharing_doi::10.25504/")
        or xref.startswith("FAIRsharing_doi:10.24404/")
    ):
        return "fairsharing", xref[len("FAIRsharing_DOI:10.25504/") :]

    for start, key in [
        ("biodbcore-", "biodbcore"),
        ("MIR:", "miriam"),
        ("ROR:", "ror"),
        ("OMICS_", "omics"),
        ("Omics_", "omics"),
        ("omics_", "omics"),
        ("ISSN ", "issn"),
        ("ISSN: ", "issn"),
        ("nif-", "nif"),
        ("ISNI:", "isni"),
        ("doi.org/", "doi"),
        ("doi:", "doi"),
        ("DOI:", "doi"),
        ("DOI: ", "doi"),
        ("RID:nlx_", "nlx"),
        ("PSSB-", "pssb"),
        ("OpenDOAR:", "opendoar"),
        ("openDOAR:", "opendoar"),
        ("ROAR:", "roar"),  # e.g., see http://roar.eprints.org/14208/
        ("hdl:", "hdl"),
        ("https://fairsharing.org/", "fairsharing.legacy"),
        ("http://fairsharing.org/", "fairsharing.legacy"),
        ("Wikidata:", "wikidata"),
        ("https://doi.org/10.5281/zenodo.", "zenodo"),
        ("https://doi.org/", "doi"),
    ]:
        if xref.startswith(start):
            return key, xref[len(start) :]

    if xref.startswith("RRID:"):
        inner_xref = xref[len("RRID:") :]
        if "_" in inner_xref:
            prefix, identifier = inner_xref.split("_", 1)
            return prefix.lower(), identifier
        elif "-" in inner_xref:
            try:
                prefix, identifier = inner_xref.split("-", 1)
            except ValueError:
                logger.debug("can't parse RRID: %s", xref)
            else:
                return prefix.lower(), identifier
        else:
            logger.debug("unknown RRID: %s", xref)
            return None

    if "doi:" in xref:
        for part in xref.split(" "):
            if part.startswith("doi"):
                return "doi", part[len("doi:") :]

    logger.debug("re3data record had unparsable xref: %s", xref)
    return None


class Re3dataAligner(Aligner):
    """Aligner for the Registry of Research Data Repositories (re3data)."""

    # NOTE(review): how each of these attributes is used is defined by ``Aligner``
    # in ``bioregistry.external.alignment_utils``, which is not shown in this file.
    key = "re3data"
    alt_key_match = "name"
    # The function in this module that returns the processed re3data records.
    getter = get_re3data
    # These names match keys produced by ``_process_record``.
    curation_header: ClassVar[Sequence[str]] = ("name", "homepage", "description")


# When this file is executed as a script, call ``Re3dataAligner.cli()``
# (presumably provided by ``Aligner``, which is not shown in this file).
if __name__ == "__main__":
    Re3dataAligner.cli()