Source code for bioregistry.external.uniprot

"""Download and parse the UniProt Cross-ref database."""

from __future__ import annotations

import json
import logging
from collections.abc import Sequence
from pathlib import Path
from typing import Any, ClassVar

import requests

from bioregistry.constants import RAW_DIRECTORY, URI_FORMAT_KEY
from bioregistry.external.alignment_utils import Aligner, load_processed
from bioregistry.utils import removeprefix

# Names exported as the public API of this module
__all__ = [
    "UniProtAligner",
    "get_uniprot",
]

# Module-level logger, named after this module's import path
logger = logging.getLogger(__name__)

#: Download URL for the UniProt registry (requests the JSON format)
URL = "https://rest.uniprot.org/database/stream?format=json&query=*"
#: Resolved path of the directory that contains this module
DIRECTORY = Path(__file__).parent.resolve()
#: Path where the raw JSON download is stored
RAW_PATH = RAW_DIRECTORY / "uniprot.json"
#: Path where the processed records are stored, next to this module
PROCESSED_PATH = DIRECTORY / "processed.json"

#: UniProt prefixes that are left out of the processed registry; the reason
#: for each one (defunct resource, unusable URL format, ...) is noted inline
skip_prefixes = {
    "UniPathway",  # doesn't exist anymore
    "BRENDA",  # has a bad format string that contains EC, UniProt, and taxon
    "eggNOG",  # not sure what this does
    "PlantReactome",  # incomprehensible URLs
    "Reactome",  # incomprehensible URLs
}


def get_uniprot(*, force_download: bool = True) -> dict[str, dict[str, str]]:
    """Get the UniProt registry.

    :param force_download: If true (the default), download the registry from
        UniProt again even when a processed copy already exists at
        ``PROCESSED_PATH``. If false and that file exists, it is loaded and
        returned without touching the network.
    :returns: A dictionary from UniProt database identifier to its processed
        record. Records rejected by ``_process_record`` and records whose
        identifier is in ``skip_prefixes`` are left out.
    :raises requests.HTTPError: If UniProt answers with an error status code.
    """
    if PROCESSED_PATH.is_file() and not force_download:
        return load_processed(PROCESSED_PATH)

    response = requests.get(URL, timeout=30)
    # Fail loudly on a 4xx/5xx answer instead of caching an error payload
    response.raise_for_status()

    # The raw dump keeps non-ASCII characters (ensure_ascii=False), so the
    # file encoding is pinned rather than left to the platform default
    RAW_PATH.write_text(
        json.dumps(response.json(), indent=2, sort_keys=True, ensure_ascii=False),
        encoding="utf-8",
    )

    rv = {}
    for record in json.loads(RAW_PATH.read_text(encoding="utf-8"))["results"]:
        processed_record = _process_record(record)
        if processed_record is None:
            continue
        # The prefix becomes the key of the result, so it is removed from the value
        prefix = processed_record.pop("prefix")
        if prefix in skip_prefixes:
            continue
        rv[prefix] = processed_record

    with PROCESSED_PATH.open("w", encoding="utf-8") as file:
        json.dump(rv, file, indent=2, sort_keys=True)
    return rv
def _process_record(record: dict[str, Any]) -> dict[str, Any] | None:
    """Convert one raw UniProt database record into a processed record.

    :param record: A single entry of the ``results`` list in the UniProt
        download. It is copied before processing, so the caller's dictionary
        is not modified.
    :returns: The processed record, which still contains the ``prefix`` key,
        or ``None`` when the record's ``dbUrl`` contains both the ``%s`` and
        the ``%u`` placeholder.
    :raises KeyError: If one of the required keys (``id``, ``name``,
        ``abbrev``, ``servers``, ``category``, ``dbUrl``) is missing.
    """
    # Work on a shallow copy: the pops below are only bookkeeping for the
    # "forgot something" check and should not empty the caller's dictionary
    record = dict(record)

    rv: dict[str, Any] = {
        "prefix": record.pop("id"),
        "name": record.pop("name"),
        "abbreviation": record.pop("abbrev"),
        # First listed server, or None (dropped below) when the list is empty
        "homepage": next(iter(record.pop("servers")), None),
        "category": record.pop("category"),
    }

    publication: dict[str, str] = {}
    doi: str | None = record.pop("doiId", None)
    if doi is not None:
        # Normalize to a bare lowercase DOI without prefix or trailing period
        doi = doi.lower().rstrip(".")
        doi = removeprefix(doi, "doi:")
        doi = removeprefix(doi, "https://doi.org/")
        # Only keep values that contain a slash after normalization
        if "/" in doi:
            publication["doi"] = doi
    pubmed = record.pop("pubMedId", None)
    if pubmed:
        publication["pubmed"] = str(pubmed)
    if publication:
        rv["publications"] = [publication]

    # These fields are discarded; tolerate records that do not carry them
    record.pop("linkType", None)
    record.pop("statistics", None)

    # Drop entries with empty values
    rv = {k: v for k, v in rv.items() if k and v}

    value = record.pop("dbUrl")
    if "%s" in value and "%u" in value:
        logger.debug("has both formats: %s", value)
        return None

    # Rewrite either UniProt placeholder into the ``$1`` placeholder
    value = value.replace("%s", "$1").replace("%u", "$1")
    if "$1" in value:
        rv[URI_FORMAT_KEY] = value
    else:
        logger.debug("no annotation in %s", rv["prefix"])

    # Anything left over is a field this function does not know about yet
    if record:
        logger.debug("forgot something: %s", record)
    return rv


class UniProtAligner(Aligner):
    """Aligner for UniProt."""

    key = "uniprot"
    alt_key_match = "abbreviation"
    # Function that produces the processed UniProt records
    getter = get_uniprot
    curation_header: ClassVar[Sequence[str]] = ("abbreviation", "name", URI_FORMAT_KEY, "category")


if __name__ == "__main__":
    UniProtAligner.cli()