# Source code for bioregistry.external.uniprot

# -*- coding: utf-8 -*-

"""Download and parse the UniProt Cross-ref database."""

import json
import logging
from typing import Mapping

from defusedxml import ElementTree
from pystow.utils import download

from bioregistry.constants import EXTERNAL, URI_FORMAT_KEY

#: Public API of this module
__all__ = [
    "get_uniprot",
]

logger = logging.getLogger(__name__)

#: Download URL for the UniProt registry
# NOTE(review): this value does not look like a complete URL (possibly
# truncated) -- confirm against the upstream source before relying on it.
URL = "*"

#: Directory in which the raw download and the processed output are stored
# NOTE(review): DIRECTORY was used below without being defined; it is assumed
# to live under the imported EXTERNAL folder -- confirm the sub-directory name.
DIRECTORY = EXTERNAL / "uniprot"
DIRECTORY.mkdir(exist_ok=True, parents=True)
#: Where the downloaded XML is written
RAW_PATH = DIRECTORY / "raw.xml"
#: Where the parsed registry is cached as JSON
PROCESSED_PATH = DIRECTORY / "processed.json"

#: Child-element path whose text is used as the prefix of each entry
# NOTE(review): the namespace part inside the braces is empty here -- confirm
# this matches the tags in the downloaded document.
PREFIX = "{}abbreviation"

#: Maps each key of an output entry to the child-element path its text is read from
kz = {
    "identifier": "{}identifier",
    "name": "{}label",
    "type": "{}type",
    "primary_topic_of": "{}primaryTopicOf",
    "category": "{}category",
    "link_is_explicit": "{}linkIsExplicit",
    "see_also": "{}seeAlso",
    URI_FORMAT_KEY: "{}urlTemplate",
    "citation": "{}citation",
    "exact_match": "{}exactMatch",
    "comment": "{}comment",
}
#: Inverse of ``kz``: child-element path -> output entry key
kzi = {v: k for k, v in kz.items()}

#: UniProt prefixes that are left out of the registry (reason given per entry)
skip_prefixes = {
    "UniPathway",  # doesn't exist anymore
    "BRENDA",  # has bad format string contains EC, UniProt, and taxon
    "eggNOG",  # not sure what this does
    "PlantReactome",  # incomprehensible URLs
    "Reactome",  # incomprehensible URLs
}

def get_uniprot(force_download: bool = True) -> Mapping[str, Mapping[str, str]]:
    """Get the UniProt registry.

    :param force_download: If false and the processed JSON file already
        exists, return its contents without downloading. Otherwise (the
        default) download the raw file again and re-process it.
    :returns: A dictionary from each prefix to its entry, where an entry maps
        the keys of ``kz`` (plus ``"prefix"``) to the text found in the
        corresponding child elements. Keys with missing or empty text are
        omitted.
    """
    if PROCESSED_PATH.is_file() and not force_download:
        with PROCESSED_PATH.open() as file:
            return json.load(file)

    # Once we get here, the raw file is always fetched again
    download(url=URL, path=RAW_PATH, force=True)
    with RAW_PATH.open() as file:
        tree = ElementTree.parse(file)
    root = tree.getroot()

    rv = {}
    for element in root.findall("{}Description"):
        prefix = element.findtext(PREFIX)
        # Entries without a prefix cannot be keyed, so drop them up front
        # together with the explicitly excluded prefixes
        if prefix is None or prefix in skip_prefixes:
            continue
        entry = dict(prefix=prefix)
        for key, path in kz.items():
            value = element.findtext(path)
            if not value:
                continue
            if key == URI_FORMAT_KEY:
                if "%s" in value and "%u" in value:
                    # FIXME: templates using both placeholders are stored unchanged
                    logger.warning("%s has both formats: %s", prefix, value)
                else:
                    # Rewrite either placeholder to the ``$1`` form
                    value = value.replace("%s", "$1").replace("%u", "$1")
            entry[key] = value
        rv[prefix] = entry

    with PROCESSED_PATH.open("w") as file:
        json.dump(rv, file, indent=2, sort_keys=True)
    return rv
# When run as a script, re-download and re-process the UniProt registry.
if __name__ == "__main__":
    get_uniprot(force_download=True)