# Source code for bioregistry.external.re3data

"""Re3data is a registry of research data repositories.

Example API endpoint: https://www.re3data.org/api/v1/repository/r3d100010772
"""

import json
import logging
from collections.abc import Mapping, Sequence
from pathlib import Path
from typing import Any, ClassVar, Optional
from xml.etree import ElementTree

import requests
from tqdm.contrib.concurrent import thread_map

from bioregistry.external.alignment_utils import Aligner, load_processed
from bioregistry.utils import removeprefix

# Public names of this module.
__all__ = [
    "Re3dataAligner",
    "get_re3data",
]

logger = logging.getLogger(__name__)
# Folder containing this module; the processed cache file is stored next to it.
DIRECTORY = Path(__file__).parent.resolve()
# Cache file that get_re3data() reads from and writes to.
PROCESSED_PATH = DIRECTORY / "processed.json"

# Root URL used to build the re3data API request URLs in this module.
BASE_URL = "https://www.re3data.org"
# XML namespace in ElementTree's ``{uri}`` notation; it is prepended to tag
# names when individual repository records are queried.
SCHEMA = "{http://www.re3data.org/schema/2-2}"



# [docs] (link marker left over from the rendered documentation page)
def get_re3data(force_download: bool = False) -> dict[str, dict[str, Any]]:
    """Load the re3data registry, downloading and caching it when needed.

    A full download takes about 9 minutes, because each of the ~3K records
    is fetched with its own API call.

    :param force_download: If true, ignore the cached file and download again
    :returns: The processed re3data records, keyed by repository identifier
    """
    cache_is_usable = PROCESSED_PATH.exists() and not force_download
    if cache_is_usable:
        return load_processed(PROCESSED_PATH)

    listing = requests.get(f"{BASE_URL}/api/v1/repositories", timeout=15)
    root = ElementTree.fromstring(listing.text)

    # The listing is the only place the DOI is read from, so remember it per
    # identifier and attach it to the detailed records afterwards.
    identifier_to_doi = {}
    for repository in root.findall("repository"):
        id_node = repository.find("id")
        if id_node is None or id_node.text is None:
            continue

        doi_node = repository.find("doi")
        if doi_node is not None and doi_node.text:
            doi = removeprefix(doi_node.text, "https://doi.org/")
        else:
            doi = None
        identifier_to_doi[id_node.text.strip()] = doi

    # One request per identifier, issued from a thread pool.
    pairs = thread_map(  # type:ignore
        _get_record,
        identifier_to_doi,
        unit_scale=True,
        unit="record",
        desc="Getting re3data",
        disable=True,
    )
    records = dict(pairs)

    # backfill DOIs
    for identifier, record in records.items():
        listed_doi = identifier_to_doi.get(identifier)
        if listed_doi:
            record["doi"] = listed_doi

    with PROCESSED_PATH.open("w") as cache_file:
        json.dump(records, cache_file, indent=2, sort_keys=True, ensure_ascii=False)

    return records



def _get_record(identifier: str) -> tuple[str, Mapping[str, Any]]:
    """Download one repository record and return it paired with its identifier."""
    url = f"{BASE_URL}/api/v1/repository/{identifier}"
    response = requests.get(url, timeout=15)
    root = ElementTree.fromstring(response.text)
    # The first child of the response's root element is what gets processed.
    record_element = root[0]
    return identifier, _process_record(identifier, record_element)


def _process_record(identifier: str, tree_inner: ElementTree.Element) -> dict[str, Any]:
    """Convert a repository XML element into a flat dictionary.

    :param identifier: The repository identifier, stored under ``prefix``
    :param tree_inner: The XML element holding the repository's fields
    :returns: A dictionary of the non-empty fields, with string values stripped
    """

    def _text(tag: str) -> Optional[str]:
        return tree_inner.findtext(f"{SCHEMA}{tag}")

    # Later identifiers overwrite earlier ones that map to the same key.
    xrefs = {}
    for element in tree_inner.findall(f"{SCHEMA}repositoryIdentifier"):
        if element.text is None:
            continue
        parsed = _clean_xref(element.text.strip())
        if parsed:
            xref_key, xref_value = parsed
            xrefs[xref_key] = xref_value

    synonyms = []
    for element in tree_inner.findall(f"{SCHEMA}additionalName"):
        if element.text is not None:
            synonyms.append(element.text.strip())

    raw = {
        "prefix": identifier,
        "name": _text("repositoryName"),
        "description": _text("description"),
        "homepage": _text("repositoryURL"),
        "synonyms": synonyms,
        "xrefs": xrefs,
    }

    license_node = tree_inner.find(f"{SCHEMA}databaseLicense/{SCHEMA}databaseLicenseName")
    if license_node is not None:
        raw["license"] = license_node.text

    # Drop empty/missing fields, then trim whitespace from the string values.
    cleaned = {}
    for field, value in raw.items():
        if not value:
            continue
        cleaned[field] = value.strip() if isinstance(value, str) else value
    return cleaned


# Spellings (including typos found in the data) of the FAIRsharing DOI prefix.
# They differ in length, so each one must be removed using its own length.
_FAIRSHARING_DOI_PREFIXES = (
    "FAIRsharing_DOI:10.25504/",
    "FAIRsharing_doi:10.25504/",
    "FAIRsharing_dOI:10.25504/",
    "FAIRSharing_doi:10.25504/",
    "FAIRsharing_doi;10.25504/",
    "FAIRsharing_doi: 10.25504/",
    "fairsharing_DOI:10.25504/",
    "fairsharing_doi:10.25504/",
    "FAIRsharin_doi:10.25504/",
    "FAIRsharing_doi.:10.25504/",
    "FAIRsharing_DOI: 10.25504/",
    "FAIRsharing_doi::10.25504/",
    "FAIRsharing_doi:10.24404/",
)

# (literal prefix, normalized key) pairs, checked in order. When one prefix is
# the start of another, the longer one must come first so it gets a chance
# to match (e.g. "DOI: " before "DOI:", zenodo DOIs before generic DOIs).
_XREF_PREFIXES = (
    ("biodbcore-", "biodbcore"),
    ("MIR:", "miriam"),
    ("ROR:", "ror"),
    ("OMICS_", "omics"),
    ("Omics_", "omics"),
    ("omics_", "omics"),
    ("ISSN ", "issn"),
    ("ISSN: ", "issn"),
    ("nif-", "nif"),
    ("ISNI:", "isni"),
    ("doi.org/", "doi"),
    ("doi:", "doi"),
    ("DOI: ", "doi"),
    ("DOI:", "doi"),
    ("RID:nlx_", "nlx"),
    ("PSSB-", "pssb"),
    ("OpenDOAR:", "opendoar"),
    ("openDOAR:", "opendoar"),
    ("ROAR:", "roar"),  # e.g., see http://roar.eprints.org/14208/
    ("hdl:", "hdl"),
    ("https://fairsharing.org/", "fairsharing.legacy"),
    ("http://fairsharing.org/", "fairsharing.legacy"),
    ("Wikidata:", "wikidata"),
    ("https://doi.org/10.5281/zenodo.", "zenodo"),
    ("https://doi.org/", "doi"),
)


def _clean_xref(xref: str) -> Optional[tuple[str, str]]:
    """Parse a raw repository identifier string into a ``(key, identifier)`` pair.

    :param xref: The raw identifier text, e.g. ``ROR:05d7whc82``
    :returns: A pair of the normalized key and the local identifier, or
        ``None`` if the string does not match any known pattern
    """
    for start in _FAIRSHARING_DOI_PREFIXES:
        if xref.startswith(start):
            # Slice by the length of the variant that actually matched.
            return "fairsharing", xref[len(start) :]

    for start, key in _XREF_PREFIXES:
        if xref.startswith(start):
            # Strip so stray spaces after the prefix don't end up in the identifier.
            return key, xref[len(start) :].strip()

    if xref.startswith("RRID:"):
        inner_xref = xref[len("RRID:") :]
        # Prefer splitting on an underscore; fall back to a hyphen.
        for delimiter in ("_", "-"):
            prefix, found, identifier = inner_xref.partition(delimiter)
            if found:
                return prefix.lower(), identifier
        logger.debug("unknown RRID: %s", xref)
        return None

    if "doi:" in xref:
        # The DOI may be embedded in free text; take the first token that
        # begins with exactly the ``doi:`` prefix being removed.
        for part in xref.split(" "):
            if part.startswith("doi:"):
                return "doi", part[len("doi:") :]

    logger.debug("re3data record had unparsable xref: %s", xref)
    return None


class Re3dataAligner(Aligner):
    """Aligner for the Registry of Research Data Repositories (re3data)."""

    # NOTE(review): these attributes are interpreted by the ``Aligner`` base
    # class, which is defined outside this file — confirm their exact
    # semantics there before changing them.
    key = "re3data"
    alt_key_match = "name"
    # Function in this module that loads the processed re3data records.
    getter = get_re3data
    curation_header: ClassVar[Sequence[str]] = ("name", "homepage", "description")


# Invoke the aligner's ``cli`` entry point when this module is run as a script.
if __name__ == "__main__":
    Re3dataAligner.cli()