# Source code for bioregistry.external.uniprot

# -*- coding: utf-8 -*-

"""Download and parse the UniProt Cross-ref database."""

import json
import logging
from typing import Mapping

from defusedxml import ElementTree
from pystow.utils import download

from bioregistry.constants import EXTERNAL, URI_FORMAT_KEY

# Public API of this module: only the registry getter is exported
__all__ = [
    "get_uniprot",
]

# Module-level logger; get_uniprot() uses it to warn about ambiguous URL templates
logger = logging.getLogger(__name__)

#: Download URL for the UniProt registry
URL = "https://rest.uniprot.org/database/stream?format=rdf&query=*"
#: Directory that holds both the raw download and the processed registry
DIRECTORY = EXTERNAL / "uniprot"
# Created at import time (including missing parents) if it does not exist yet
DIRECTORY.mkdir(exist_ok=True, parents=True)
#: Path that the content downloaded from ``URL`` is written to
RAW_PATH = DIRECTORY / "raw.xml"
#: Path of the parsed registry, stored as JSON and reused as a cache
PROCESSED_PATH = DIRECTORY / "processed.json"

#: Qualified tag (``{namespace}local-name``) of the child element whose text
#: is used as the prefix, i.e. the key, of each parsed record
PREFIX = "{http://purl.uniprot.org/core/}abbreviation"

#: Maps each key of an output entry to the qualified tag
#: (``{namespace}local-name``) of the child element whose text supplies the
#: value for that key; ``get_uniprot`` looks every tag up with ``findtext``
kz = {
    "identifier": "{http://purl.org/dc/terms/}identifier",
    "name": "{http://www.w3.org/2000/01/rdf-schema#}label",
    "type": "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}type",
    "primary_topic_of": "{http://xmlns.com/foaf/0.1/}primaryTopicOf",
    "category": "{http://purl.uniprot.org/core/}category",
    "link_is_explicit": "{http://purl.uniprot.org/core/}linkIsExplicit",
    "see_also": "{http://www.w3.org/2000/01/rdf-schema#}seeAlso",
    URI_FORMAT_KEY: "{http://purl.uniprot.org/core/}urlTemplate",
    "citation": "{http://purl.uniprot.org/core/}citation",
    "exact_match": "{http://www.w3.org/2004/02/skos/core#}exactMatch",
    "comment": "{http://www.w3.org/2000/01/rdf-schema#}comment",
}
#: Inverse of ``kz``: qualified tag -> output entry key
kzi = {v: k for k, v in kz.items()}

#: UniProt prefixes that are left out of the parsed registry; the reason for
#: each exclusion is given next to the entry
skip_prefixes = {
    "UniPathway",  # doesn't exist anymore
    "BRENDA",  # has bad format string contains EC, UniProt, and taxon
    "eggNOG",  # not sure what this does
    "PlantReactome",  # incomprehensible URLs
    "Reactome",  # incomprehensible URLs
}


def get_uniprot(force_download: bool = True) -> Mapping[str, Mapping[str, str]]:
    """Get the UniProt registry.

    :param force_download: If true (the default), always download the registry
        from ``URL`` and parse it again. If false, the JSON file at
        ``PROCESSED_PATH`` is loaded and returned when it already exists.
    :returns: A mapping from each prefix (the text of the ``PREFIX`` element)
        to a dictionary with a ``"prefix"`` key plus one key from ``kz`` for
        every child element that has non-empty text.
    """
    if PROCESSED_PATH.is_file() and not force_download:
        with PROCESSED_PATH.open(encoding="utf-8") as file:
            return json.load(file)

    download(url=URL, path=RAW_PATH, force=True)
    # Open in binary mode so the XML parser decodes the document according to
    # its own encoding declaration instead of the locale's default encoding.
    with RAW_PATH.open("rb") as file:
        tree = ElementTree.parse(file)
    root = tree.getroot()

    rv = {}
    for element in root.findall("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description"):
        prefix = element.findtext(PREFIX)
        # Records without a prefix element cannot be keyed; listed ones are unwanted
        if prefix is None or prefix in skip_prefixes:
            continue
        entry = {"prefix": prefix}
        for key, path in kz.items():
            value = element.findtext(path)
            if not value:
                continue
            if key == URI_FORMAT_KEY:
                if "%s" in value and "%u" in value:
                    # FIXME: a template using both placeholders is ambiguous,
                    # so it is stored unchanged after emitting a warning.
                    logger.warning("%s has both formats: %s", prefix, value)
                else:
                    # Rewrite either UniProt placeholder to the ``$1`` form
                    value = value.replace("%s", "$1").replace("%u", "$1")
            entry[key] = value
        rv[prefix] = entry

    with PROCESSED_PATH.open("w", encoding="utf-8") as file:
        json.dump(rv, file, indent=2, sort_keys=True)
    return rv
if __name__ == "__main__":
    # Running this module as a script always re-downloads and re-parses the registry
    get_uniprot(force_download=True)