# Source code for bioregistry.external.wikidata

# -*- coding: utf-8 -*-

"""Query, download, and format Wikidata as a registry."""

import json
import logging
from textwrap import dedent
from typing import Dict

import click

from bioregistry.constants import EXTERNAL, URI_FORMAT_KEY
from bioregistry.utils import query_wikidata, removeprefix

# Only the cached accessor is part of the public API.
__all__ = ["get_wikidata"]

# Directory holding this source's files; it is created at import time if missing.
DIRECTORY = EXTERNAL / "wikidata"
DIRECTORY.mkdir(parents=True, exist_ok=True)

# File locations inside the directory above.
PROCESSED_PATH = DIRECTORY / "processed.json"
RAW_PATH = DIRECTORY / "raw.json"

logger = logging.getLogger(__name__)

#: SPARQL query sent to Wikidata for properties reachable via the path
#: ``wdt:P31/wdt:P279+`` from one of the category items listed in the ``VALUES``
#: clause (annotated there as chemistry, biology, and research). Multi-valued
#: fields are collapsed with ``GROUP_CONCAT`` into tab-separated strings so that
#: each property yields a single row; ``_get_wikidata`` splits them on tabs again.
QUERY = dedent(
    """\
    SELECT DISTINCT
      (?prop AS ?prefix)
      ?propLabel
      ?propDescription
      ?miriam
      ?pattern
      (GROUP_CONCAT(DISTINCT ?homepage_; separator='\\t') AS ?homepage)
      (GROUP_CONCAT(DISTINCT ?format_; separator='\\t') AS ?uri_format)
      (GROUP_CONCAT(DISTINCT ?format_rdf_; separator='\\t') AS ?uri_format_rdf)
      (GROUP_CONCAT(DISTINCT ?database_; separator='\\t') AS ?database)
      (GROUP_CONCAT(DISTINCT ?example_; separator='\\t') AS ?example)
      (GROUP_CONCAT(DISTINCT ?short_name_; separator='\\t') AS ?short_name)
    WHERE {
      VALUES ?category {
        wd:Q21294996  # chemistry
        wd:Q22988603  # biology
        wd:Q80840868  # research
      }
      ?prop wdt:P31/wdt:P279+ ?category .
      BIND( SUBSTR(STR(?prop), 32) AS ?propStr )
      OPTIONAL { ?prop wdt:P1793 ?pattern }
      OPTIONAL { ?prop wdt:P4793 ?miriam }

      OPTIONAL { ?prop wdt:P1813 ?short_name_ }
      OPTIONAL { ?prop wdt:P1896 ?homepage_ }
      OPTIONAL { ?prop wdt:P1630 ?format_ }
      OPTIONAL { ?prop wdt:P1921 ?format_rdf_ }
      OPTIONAL { ?prop wdt:P1629 ?database_ }
      OPTIONAL {
        ?prop p:P1855 ?statement .
        ?statement ?propQualifier ?example_ .
        FILTER (STRSTARTS(STR(?propQualifier), "http://www.wikidata.org/prop/qualifier/"))
        FILTER (?propStr = SUBSTR(STR(?propQualifier), 40))
      }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
    }
    GROUP BY ?prop ?propLabel ?propDescription ?miriam ?pattern
    ORDER BY ?prop
    """
)

#: Maps SPARQL result variable names onto the key names used in the processed records
RENAMES: Dict[str, str] = {"propLabel": "name", "propDescription": "description"}

#: Curated database item to use, keyed by property ID, when the query returns
#: more than one database for a property
CANONICAL_DATABASES: Dict[str, str] = {
    "P6800": "Q87630124",  # -> NCBI Genome
    "P627": "Q48268",  # -> International Union for Conservation of Nature
    "P351": "Q1345229",  # NCBI Gene
    "P4168": "Q112783946",  # Immune epitope database
}

#: Curated homepage to use, keyed by property ID, when the query returns more
#: than one homepage for a property (none curated so far)
CANONICAL_HOMEPAGES: Dict[str, str] = {}

#: Curated URI format to use, keyed by property ID, when the query returns more
#: than one URI format for a property
CANONICAL_URI_FORMATS: Dict[str, str] = {
    "P830": "https://eol.org/pages/$1",
    "P2085": "https://jglobal.jst.go.jp/en/redirect?Nikkaji_No=$1",
}

# Wikidata items that carry MIRIAM IDs but, per the original (truncated) note,
# presumably should not. Each ID is listed once; the earlier literal repeated
# six of them, which had no effect on the resulting set.
MIRIAM_BLACKLIST = {
    "Q51162088",
    "Q56221155",
    "Q47519952",
    "Q106201514",
    "Q106201090",
    "Q106201991",
    "Q106201904",
    "Q106832467",
    "Q96212863",
    "Q106695243",
}


def _get_wikidata():
    """Run the Wikidata query and build a dictionary of processed property records.

    Each result row is flattened to plain strings, its tab-joined fields are split
    into sorted tuples, a single database/homepage/URI format is chosen, and the
    regular expression pattern is anchored. Rows whose examples are all Wikidata
    entity URIs are treated as relationships and skipped.

    :returns: A dictionary from property ID (the entity URI with its prefix stripped)
        to the processed record for that property.
    """
    entity_uri_prefix = "http://www.wikidata.org/entity/"
    # Fields that the query aggregates into tab-separated strings
    multivalued_keys = (
        "homepage",
        "uri_format_rdf",
        URI_FORMAT_KEY,
        "database",
        "example",
        "short_name",
    )
    # Fields that must be reduced to one value, with their curated overrides
    canonical_lookups = (
        ("database", CANONICAL_DATABASES),
        ("homepage", CANONICAL_HOMEPAGES),
        ("uri_format", CANONICAL_URI_FORMATS),
    )

    registry = {}
    for row in query_wikidata(QUERY):
        raw_examples = row.get("example", {}).get("value", "").split("\t")
        is_relationship = bool(raw_examples) and all(
            raw_example.startswith(entity_uri_prefix) for raw_example in raw_examples
        )
        if is_relationship:
            continue

        # Unwrap each cell to its string value, dropping empty ones
        record = {}
        for variable, cell in row.items():
            text = cell["value"]
            if text:
                record[RENAMES.get(variable, variable)] = text

        prefix = removeprefix(record["prefix"], entity_uri_prefix)
        record["prefix"] = prefix

        for field in multivalued_keys:
            if field not in record:
                continue
            parts = [removeprefix(part, entity_uri_prefix) for part in record[field].split("\t")]
            parts.sort()
            record[field] = tuple(parts)

        for key, canonicals in canonical_lookups:
            values = record.get(key, [])
            if not values:
                continue
            if len(values) == 1:
                chosen = values[0]
            elif prefix in canonicals:
                chosen = canonicals[prefix]
            else:
                # No curated choice yet: report it and fall back to the first sorted value
                logger.warning(f"need to curate canonical {key} for {prefix}: {', '.join(values)}")
                chosen = values[0]
            record[key] = chosen

        pattern = record.get("pattern")
        if pattern:
            # Make sure the pattern is anchored at both ends
            anchored = pattern if pattern.startswith("^") else "^" + pattern
            if not anchored.endswith("$"):
                anchored = anchored + "$"
            record["pattern"] = anchored

        registry[prefix] = record

    return registry


def get_wikidata(force_download: bool = False):
    """Get the Wikidata registry, reusing the processed cache file when possible.

    :param force_download: If true, ignore any existing ``processed.json`` cache
        and query Wikidata again.
    :returns: A dictionary from Wikidata property ID to its processed record. A
        fresh result comes straight from ``_get_wikidata`` (multi-valued fields are
        tuples); a cached result comes from JSON, so the same fields are lists.
    """
    if PROCESSED_PATH.exists() and not force_download:
        with PROCESSED_PATH.open() as file:
            return json.load(file)

    data = _get_wikidata()
    # Write to the same file the cache check above reads. The data is already
    # post-processed, and writing it to RAW_PATH meant the cache was never filled.
    with PROCESSED_PATH.open("w") as file:
        json.dump(data, file, indent=2, sort_keys=True)
    return data


@click.command()
def _main():
    # Always bypass the cache so running the module refreshes the registry.
    # (Kept as a comment: click would show a docstring as the command's help text.)
    data = get_wikidata(force_download=True)
    summary = f"Got {len(data):,} records"
    click.echo(summary)


if __name__ == "__main__":
    _main()