# Source code for bioregistry.external.rrid

"""A source for the SciCrunch Registry (SCR).

The SciCrunch static data was suggested by Anita Bandrowski in
https://github.com/biopragmatics/bioregistry/issues/949#issuecomment-1747702117.
Based on the name, it was likely exported on August 24th, 2023. It can be accessed at
https://docs.google.com/spreadsheets/d/1BEPZXZsENhK7592AR83xUwbPbR2J-GVQ/edit?\
usp=sharing&ouid=107737386203376389514&rtpof=true&sd=true.
"""

import csv
from collections.abc import Mapping, Sequence
from pathlib import Path
from typing import ClassVar

from bioregistry.constants import RAW_DIRECTORY
from bioregistry.external.alignment_utils import Aligner

# Names exported as this module's public API.
__all__ = [
    "RRIDAligner",
    "get_rrid",
]


# Resolved directory containing this module (not referenced by the other
# definitions visible in this module).
HERE = Path(__file__).parent.resolve()
# Tab-delimited data file inside RAW_DIRECTORY that get_rrid() reads.
PATH = RAW_DIRECTORY.joinpath("rrid.tsv")

# Presumably a source-column -> output-key rename table; it is not referenced
# by the other definitions visible in this module.
COLUMN_RENAMES = {"Resource_Name": "name"}
# Keyword values that get_rrid() removes from every record's keyword list.
skip = {"RIN", "Resource Information Network"}

#: Prefixes to skip (this mapping is what ``RRIDAligner.get_skip`` returns),
#: each mapped to a short human-readable reason.
#: FIXME - see https://github.com/biopragmatics/bioregistry/issues/954
UNCURATABLE = {
    "XEP": "could not find an example entity number",
    "CWRU": "could not find evidence that this is an identifier resource",
    "XGSC": "could not find evidence that this is an identifier resource",
    "SSCLBR": "dead resource",
    "EXRC": "resource does not have stable/referencable identifiers for entities",
    "IMSR": "meta-site that seems to wrap other IMSR sites",
    "IMSR_CARD": "dead website",
    "IMSR_CMMR": "just a wrapper around MGI",
    "IMSR_CRL": "Massive site, too cryptic, can't find",
    "IMSR_GPT": "actual URLs don't match accession numbers",
    "IMSR_HAR": "could not find evidence that this is an identifier resource",
    "IMSR_NM-KI": "multiple conflicting identifiers - actual URLs don't match accession numbers",
    "IMSR_NIG": "could not find evidence that this is an identifier resource",
    "IMSR_TIGM": "could not find evidence that this is an identifier resource",
}


def get_rrid(*, force_download: bool = False) -> dict[str, dict[str, str]]:
    """Get RRIDs.

    Reads the tab-separated SciCrunch export at ``PATH`` and builds one entry
    for every row whose ``RRID_Identifier_Pattern`` column starts with
    ``RRID:``. Rows without such a pattern are ignored. If two rows yield the
    same prefix, the later row replaces the earlier one.

    :param force_download: Accepted for interface compatibility only; this
        function never downloads anything and always reads the local file.
    :returns: A dictionary keyed by prefix (the identifier pattern without the
        leading ``RRID:`` and without trailing underscores). Each value always
        has the keys ``name``, ``homepage`` and ``scr`` and, when the source
        row provides them, ``pubmeds`` (list of PubMed identifiers),
        ``keywords`` (sorted list), ``abbreviation`` and ``twitter``.

        NOTE: ``pubmeds`` and ``keywords`` are lists, so the values are not
        strictly ``dict[str, str]`` as the annotation suggests.
    """
    rv = {}
    # The csv module expects files opened with newline="" so that line breaks
    # embedded in quoted fields survive. The explicit encoding removes the
    # dependency on the platform locale (assumes the raw file is UTF-8).
    with PATH.open(encoding="utf-8", newline="") as file:
        reader = csv.DictReader(file, delimiter="\t")
        for record in reader:
            rrid_pattern = record.get("RRID_Identifier_Pattern")
            if not rrid_pattern or not rrid_pattern.startswith("RRID:"):
                continue
            prefix = rrid_pattern.removeprefix("RRID:").rstrip("_")
            ddd = {
                "name": record["Resource_Name"],
                "homepage": record["Resource_URL"],
                # removeprefix only strips a real "SCR_" prefix instead of
                # blindly discarding the first four characters
                "scr": record["scr_id"].removeprefix("SCR_"),
                # "uri_format": f"https://scicrunch.org/resolver/RRID:{prefix}_$1",
            }

            # DictReader fills the cells of short rows with None, hence the
            # ``or ""`` guards before splitting.
            pubmeds = [
                x.removeprefix("PMID:")
                for x in _split(record["Defining_Citation"] or "")
                if x.startswith("PMID:")
            ]
            if pubmeds:
                ddd["pubmeds"] = pubmeds

            # Splitting an empty cell yields [""]; drop empty entries so that a
            # record without keywords does not end up with ``keywords: [""]``.
            keywords = sorted(
                {keyword for keyword in _split(record["Keywords"] or "") if keyword} - skip
            )
            if keywords:
                ddd["keywords"] = keywords

            # Only store the abbreviation when it adds information beyond the prefix
            abbreviation = record["Abbreviation"]
            if abbreviation and abbreviation != prefix:
                ddd["abbreviation"] = abbreviation

            twitter = record["Twitter_Handle"]
            if twitter:
                ddd["twitter"] = twitter.lstrip("@")
            # could get license
            rv[prefix] = ddd
    return rv
def _split(s: str) -> list[str]:
    """Cut a comma-delimited string into parts, trimming whitespace around each part."""
    return list(map(str.strip, s.split(",")))


class RRIDAligner(Aligner):
    """Aligner for the SciCrunch Registry (RRID) resource."""

    # Configuration consumed by the ``Aligner`` base class
    key = "rrid"
    getter = get_rrid
    alt_key_match = "abbreviation"
    curation_header: ClassVar[Sequence[str]] = ("name", "homepage")

    def get_skip(self) -> Mapping[str, str]:
        """Return the prefixes to skip, each mapped to the reason for skipping it."""
        return UNCURATABLE


if __name__ == "__main__":
    RRIDAligner.cli()