Source code for bioregistry.external.prefixcommons

"""Download registry information from the Life Science Registry (LSR), which powers Prefix Commons.

.. seealso::

    - http://tinyurl.com/lsregistry which should expand to
      https://docs.google.com/spreadsheets/d/1cDGJcRteb9F5-jbw7Q7np0kk4hfWhdBHNYRIg3LXDrs/edit#gid=0
"""

from __future__ import annotations

import json
import logging
from collections.abc import Mapping, Sequence
from pathlib import Path
from typing import Any, ClassVar

from pystow.utils import download

from bioregistry.constants import RAW_DIRECTORY
from bioregistry.external.alignment_utils import Aligner, load_processed
from bioregistry.license_standardizer import standardize_license

__all__ = [
    "PrefixCommonsAligner",
    "get_prefixcommons",
]

logger = logging.getLogger(__name__)

DIRECTORY = Path(__file__).parent.resolve()
RAW_PATH = RAW_DIRECTORY / "prefixcommons.tsv"
PROCESSED_PATH = DIRECTORY / "processed.json"
GOOGLE_DOCUMENT_ID = "1c4DmQqTGS4ZvJU_Oq2MFnLk-3UUND6pWhuMoP8jgZhg"
URL = f"https://docs.google.com/spreadsheets/d/{GOOGLE_DOCUMENT_ID}/export?format=tsv&gid=0"
COLUMNS = [
    "prefix",  # "Preferred Prefix",
    "synonyms",
    "rdf_uri_prefix",  # this is the RDF-useful version
    "alternate_uri_formats",  # these are alternative URI prefixes
    "MIRIAM",
    "BiodbcoreID",
    "bioportal",  # "BioPortal Ontology ID",
    "miriam",  # "identifiers.org",
    "Abbreviation",
    "name",  # originally: Title,
    "description",  # "Description",
    "pubmed_ids",  # "PubMed ID"
    "Organization",
    "Type (warehouse, dataset or terminology)",
    "keywords",
    "homepage",  # "Homepage",
    "Functional?",
    "part_of",  # sub-namespace in dataset
    "part of collection",
    "license_url",
    "License Text",
    "Rights",
    "pattern",  # "ID regex",
    "example",  # "ExampleID",
    "uri_format",  # "Provider HTML URL",
    "",
    "MIRIAM checked",
    "MIRIAM curator notes",
    "MIRIAM coverage",
    "updates",
    "year last accessible",
    "wayback url",
    "last updated",
    "last updated by",
    "last updated by (orcid)",
]
KEEP = {
    "prefix",
    "synonyms",
    "bioportal",
    "miriam",
    "name",
    "description",
    "pubmed_ids",
    "keywords",
    "homepage",
    "pattern",
    "example",
    "uri_format",
    "license_url",
    "alternate_uri_formats",
    "rdf_uri_prefix",
}
#: These contain synonyms with mismatches
DISCARD_SYNONYMS = {"biogrid", "cath", "zfa"}
SKIP_URI_FORMATS = {
    "http://purl.obolibrary.org/obo/$1",
    "http://www.ebi.ac.uk/ontology-lookup/?termId=$1",
    "http://arabidopsis.org/servlets/TairObject?accession=$1",
}



[docs]
def get_prefixcommons(
    force_download: bool = False, force_process: bool = False
) -> dict[str, dict[str, Any]]:
    """Get the Life Science Registry."""
    if PROCESSED_PATH.exists() and not (force_download or force_process):
        return load_processed(PROCESSED_PATH)

    download(url=URL, path=RAW_PATH, force=force_download)
    rows = {}
    with RAW_PATH.open() as file:
        lines = iter(file)
        next(lines)  # throw away header
        for line in lines:
            prefix, data = _process_row(line)
            if prefix and data:
                rows[prefix] = data

    PROCESSED_PATH.write_text(json.dumps(rows, sort_keys=True, indent=2))
    return rows



def _process_row(line: str) -> tuple[str, dict[str, Any]] | tuple[None, None]:
    cells = line.strip().split("\t")
    prefix = cells[0]
    cells_processed = [None if cell in {"N/A"} else cell for cell in cells]
    rv: dict[str, Any] = {
        key: value.strip()
        for key, value in zip(COLUMNS, cells_processed)
        if key and value and key in KEEP
    }
    for key in ["name", "description", "example", "pattern"]:
        if not rv.get(key):
            return None, None

    for key in ["keywords", "pubmed_ids"]:
        values = rv.get(key)
        if values:
            rv[key] = [value.strip() for value in values.split(",")]

    synonyms = rv.pop("synonyms", None)
    if not synonyms:
        pass
    elif prefix in DISCARD_SYNONYMS:
        pass
    else:
        synonyms_it = [s.strip() for s in synonyms.split(",")]
        synonyms_it = [
            synonym
            for synonym in synonyms_it
            if synonym.lower() != prefix.lower() and " " not in synonym
        ]
        if synonyms_it:
            rv["synonyms"] = synonyms_it

    license_url = rv.pop("license_url", None)
    if license_url:
        rv["license"] = standardize_license(license_url)

    uri_format = rv.pop("uri_format", None)
    if uri_format:
        uri_format = uri_format.replace("$id", "$1").replace("[?id]", "$1").replace("$d", "$1")
        if uri_format not in SKIP_URI_FORMATS:
            rv["uri_format"] = uri_format

    uri_rdf_formats = _get_uri_formats(rv, "rdf_uri_prefix")
    if uri_rdf_formats:
        if len(uri_rdf_formats) > 1:
            logger.warning("got multiple RDF formats for %s", prefix)
        rv["rdf_uri_format"] = uri_rdf_formats[0]

    alt_uri_formats_clean = _get_uri_formats(rv, "alternate_uri_formats")
    if alt_uri_formats_clean:
        rv["alt_uri_formats"] = alt_uri_formats_clean

    pattern = rv.get("pattern")
    if pattern:
        if not pattern.startswith("^"):
            pattern = f"^{pattern}"
        if not pattern.endswith("$"):
            pattern = f"{pattern}$"
        rv["pattern"] = pattern

    return prefix, rv


def _get_uri_formats(iv: dict[str, Any], key: str) -> list[str]:
    uri_formats: str | None = iv.pop(key, None)
    if not uri_formats:
        return []
    rv: list[str] = []
    for uri_format in uri_formats.split(","):
        uri_format = uri_format.strip()
        if not uri_format:
            continue
        if "identifiers.org" in uri_format:  # FIXME some non-miriam resources might use this
            continue
        if "obofoundry.org" in uri_format:  # FIXME some non-obo resources might use this
            continue
        if "obolibrary.org" in uri_format:  # FIXME take this check out
            continue
        if "$1" in uri_format or "[?id]" in uri_format:  # FIXME check if these come at the end
            continue
        uri_format = f"{uri_format}$1"
        if uri_format in SKIP_URI_FORMATS:
            continue
        rv.append(uri_format)
    return rv


SKIP = {
    "redidb": "Website is dead",
    "trnadbce": "Website is password protected",
    "pogs_plantrbp": "Website is dead",
    "smr": "no evidence of it existing",
}
PROVIDERS = {
    "homeodomain_resource": "hdr",
    "interpare": "pdb",
    "consurfdb": "pdb",
    "homstrad": "pdb",
    "jail": "pdb",
    "hotsprint": "pdb",
    "lpfc": "pdb",
    "pdbreprdb": "pdb",
    "pdtd": "pdb",
    "supersite": "pdb",
    "pairsdb": "pdb",
    "icbs": "pdb",
    "pdbbind": "pdb",
    "pdb.tm": "pdb",
    "ligasite": "pdb",
    "firedb": "pdb",
    "dali": "pdb",
    "pisite": "pdb",
    "procognate": "pdb",
    "binding_moad": "pdb",
    "bhfucl": "uniprot",
    "pdzbase": "uniprot",
    "unisave": "uniprot",
    "2dbaseecoli": "uniprot",
    "swiss2dpage": "uniprot",
    "siena2dpage": "uniprot",
    "phci2dpage": "uniprot",
    "reproduction2dpage": "uniprot",
    "agbase": "uniprot",
    "iproclass": "uniprot",
    "asap_ii": "unigene",
    "snp2nmd": "dbsnp",
    "cangem": "ensembl",
    "cisred": "ensembl",
    "interferome": "ensembl",
    "spliceinfo": "ensembl",
    "piggis": "ensembl",
    "corg": "ensembl",
    "greglist": "ensembl",
    "gxa": "ensembl",
    "cyclebase": "ensembl",
    "droid": "flybase",
    "enzyme": "eccode",
    "orenza": "eccode",
    "explorenz": "eccode",
    "fcp": "eccode",
    "mousecyc": "mgi",
    "imgt.3dstructuredb": "pdb",
    "mapu": "ipi",
    "sysbodyfluid": "ipi",
    "uniprot.taxonomy": "ncbitaxon",
    "domine": "pfam",
    "dima": "pfam",
    "interdom": "pfam",
    "sdr": "pfam",
    "ipfam": "pfam",
    "hupi": "hgnc.symbol",
    "chimerdb": "hgnc.symbol",
    "po.psds": "po",
    "cutdb": "pmap.cutdb",
    "hubmed": "pubmed",
}


class PrefixCommonsAligner(Aligner):
    """Aligner for Prefix Commons."""

    key = "prefixcommons"
    getter = get_prefixcommons
    curation_header: ClassVar[Sequence[str]] = (
        "name",
        "synonyms",
        "description",
        "example",
        "pattern",
        "uri_format",
    )
    alt_keys_match = "synonyms"
    include_new = False

    def get_skip(self) -> Mapping[str, str]:
        """Get skip prefixes."""
        return {**SKIP, **PROVIDERS}

    def get_curation_row(self, external_id: str, external_entry: dict[str, Any]) -> Sequence[str]:
        """Prepare curation rows for unaligned Prefix Commons registry entries."""
        return [
            external_entry["name"],
            ", ".join(external_entry.get("synonyms", [])),
            external_entry.get("description", "").replace('"', ""),
            external_entry.get("example", ""),
            external_entry.get("pattern", ""),
            external_entry.get("uri_format", ""),
        ]


if __name__ == "__main__":
    PrefixCommonsAligner.cli()