"""Query, download, and format Wikidata as a registry."""
import json
import logging
from collections.abc import Iterable, Mapping, Sequence
from pathlib import Path
from textwrap import dedent
from typing import Any, ClassVar
from bioregistry.constants import BIOREGISTRY_PATH, URI_FORMAT_KEY
from bioregistry.external.alignment_utils import Aligner, load_processed
from bioregistry.utils import query_wikidata, removeprefix
__all__ = [
"WikidataAligner",
"get_wikidata",
]
logger = logging.getLogger(__name__)
DIRECTORY = Path(__file__).parent.resolve()
PROCESSED_PATH = DIRECTORY / "processed.json"
PROPERTIES_QUERY = dedent(
"""\
SELECT ?propStr
WHERE {
VALUES ?category {
wd:Q21294996 # chemistry
wd:Q22988603 # biology
wd:Q80840868 # research
}
?prop wdt:P31/wdt:P279+ ?category .
BIND( SUBSTR(STR(?prop), 32) AS ?propStr )
}
ORDER BY ?prop
"""
)
#: A query to wikidata for properties related to chemistry, biology, and related
QUERY_FMT = dedent(
"""\
SELECT DISTINCT
(?prop AS ?prefix)
?propLabel
?propDescription
?miriam
?pattern
(GROUP_CONCAT(DISTINCT ?homepage_; separator='\\t') AS ?homepage)
(GROUP_CONCAT(DISTINCT ?format_; separator='\\t') AS ?uri_format)
(GROUP_CONCAT(DISTINCT ?format_rdf_; separator='\\t') AS ?uri_format_rdf)
(GROUP_CONCAT(DISTINCT ?database_; separator='\\t') AS ?database)
(GROUP_CONCAT(DISTINCT ?example_; separator='\\t') AS ?example)
(GROUP_CONCAT(DISTINCT ?short_name_; separator='\\t') AS ?short_name)
WHERE {
{
VALUES ?category {
wd:Q21294996 # chemistry
wd:Q22988603 # biology
wd:Q80840868 # research
}
?prop wdt:P31/wdt:P279+ ?category .
}
UNION {
VALUES ?prop { %s }
}
BIND( SUBSTR(STR(?prop), 32) AS ?propStr )
OPTIONAL { ?prop wdt:P1793 ?pattern }
OPTIONAL { ?prop wdt:P4793 ?miriam }
OPTIONAL { ?prop wdt:P1813 ?short_name_ }
OPTIONAL { ?prop wdt:P1896 ?homepage_ }
OPTIONAL { ?prop wdt:P1630 ?format_ }
OPTIONAL { ?prop wdt:P1921 ?format_rdf_ }
OPTIONAL { ?prop wdt:P1629 ?database_ }
OPTIONAL {
?prop p:P1855 ?statement .
?statement ?propQualifier ?example_ .
FILTER (STRSTARTS(STR(?propQualifier), "http://www.wikidata.org/prop/qualifier/"))
FILTER (?propStr = SUBSTR(STR(?propQualifier), 40))
}
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
GROUP BY ?prop ?propLabel ?propDescription ?miriam ?pattern
ORDER BY ?prop
"""
)
SKIP = {
"P3205": "is a relationship",
"P3781": "is a relationship",
"P4545": "is a relationship",
"P3190": "is a relationship",
"P4954": "is a relationship",
"P4000": "is a relationship",
"P3189": "is a relationship",
"P3310": "is a relationship",
"P3395": "is a data property",
"P3387": "is a data property",
"P3337": "is a data property",
"P3485": "is a data property",
"P3486": "is a data property",
"P10322": "is a data property",
"P10630": "is a data property",
"P1193": "is a data property",
"P1603": "is a data property",
"P2067": "is a data property",
"P2844": "is a data property",
"P2854": "is a data property",
"P3487": "is a data property",
"P3492": "is a data property",
"P4214": "is a data property",
"P3488": "is a data property",
"P4250": "is a data property",
"P574": "is a data property",
"P7770": "is a data property",
"P783": "is a data property",
"P7862": "is a data property",
"P8010": "is a data property",
"P8011": "is a data property",
"P8049": "is a data property",
"P8556": "is a data property",
"P9107": "is a data property",
"Q112586709": "should not be annotated like a property",
"Q111831044": "should not be annotated like a property",
"Q115916376": "should not be annotated like a property",
"P1104": "is a data property",
"P10676": "is a data property",
"P181": "is a data property",
"P1843": "is a data property",
"P225": "is a data property",
"P3752": "is a data property",
"P8558": "is a data property",
"P6507": "is a data property",
"P428": "is a data property",
}
RENAMES = {"propLabel": "name", "propDescription": "description"}
CANONICAL_DATABASES = {
"P6800": "Q87630124", # -> NCBI Genome
"P627": "Q48268", # -> International Union for Conservation of Nature
"P351": "Q1345229", # NCBI Gene
"P4168": "Q112783946", # Immune epitope database
}
CANONICAL_HOMEPAGES: dict[str, str] = {
"P6852": "https://www.ccdc.cam.ac.uk",
"P7224": "http://insecta.pro/catalog",
"P1761": "http://delta-intkey.com",
"P2083": "http://www.leadscope.com",
"P7965": "https://www.scilit.net",
"P7963": "https://github.com/obophenotype/cell-ontology",
"P2275": "http://www.who.int/medicines/services/inn/en/",
"P10246": "https://medlineplus.gov/druginfo/herb_All.html",
"P10245": "https://medlineplus.gov/druginfo/drug_Aa.html",
"P9704": "https://www.monumentaltrees.com/en/",
"P9356": "http://portal.hymao.org/projects/32/public/label/list_all",
"P3088": "https://taibnet.sinica.edu.tw/home_eng.php",
"P486": "http://www.nlm.nih.gov",
}
CANONICAL_URI_FORMATS = {
"P830": "https://eol.org/pages/$1",
"P2085": "https://jglobal.jst.go.jp/en/redirect?Nikkaji_No=$1",
"P604": "https://medlineplus.gov/ency/article/$1.htm",
"P492": "https://omim.org/OMIM:$1",
"P486": "http://www.nlm.nih.gov",
"P3201": "http://bioportal.bioontology.org/ontologies/MEDDRA?p=classes&conceptid=$1",
"P7224": "http://insecta.pro/taxonomy/$1",
"P3088": "https://taibnet.sinica.edu.tw/eng/taibnet_species_detail.php?name_code=$1",
"P8150": "https://search.bvsalud.org/global-literature-on-novel-coronavirus-2019-ncov/resource/en/$1",
"P9272": "https://decs.bvsalud.org/ths/resource/?id=$1",
"P8082": "https://www.mscbs.gob.es/ciudadanos/centros.do?metodo=realizarDetalle&tipo=hospital&numero=$1",
"P10095": "https://www.surgeons.org/Profile/$1",
"P5397": "http://www.tierstimmen.org/en/database?field_spec_species_target_id_selective=$1",
"P7471": "https://www.inaturalist.org/places/$1",
"P696": "https://scicrunch.org/scicrunch/interlex/view/ilx_$1",
}
CANONICAL_RDF_URI_FORMATS: dict[str, str] = {}
# Stuff with miriam IDs that shouldn't
MIRIAM_BLACKLIST = {
"Q106201090",
"Q106201514",
"Q106201904",
"Q106201991",
"Q106695243",
"Q106832467",
"Q47519952",
"Q51162088",
"Q56221155",
"Q96212863",
}
def _get_mapped() -> set[str]:
return {
value
for record in json.loads(BIOREGISTRY_PATH.read_text()).values()
for metaprefix, value in record.get("mappings", {}).items()
if metaprefix == "wikidata"
}
def _get_query(properties: Iterable[str]) -> str:
values = " ".join(f"wd:{p}" for p in properties)
return QUERY_FMT % values
def _get_wikidata() -> dict[str, dict[str, Any]]:
"""Iterate over Wikidata properties connected to biological databases."""
mapped = _get_mapped()
# throw out anything that can be queried directly
mapped.difference_update(
bindings["propStr"]["value"]
for bindings in query_wikidata(PROPERTIES_QUERY)
if bindings["propStr"]["value"].startswith("P") # throw away any regular ones
)
rv = {}
for bindings in query_wikidata(_get_query(mapped)):
bindings = {
RENAMES.get(key, key): value["value"]
for key, value in bindings.items()
if value["value"]
}
prefix = bindings["prefix"] = removeprefix(
bindings["prefix"], "http://www.wikidata.org/entity/"
)
if prefix in SKIP or not prefix:
continue
examples = bindings.get("example", "").split("\t")
if examples and all(
example.startswith("http://www.wikidata.org/entity/") for example in examples
):
# This is a relationship
continue
for key in [
"homepage",
"uri_format_rdf",
URI_FORMAT_KEY,
"database",
"example",
"short_name",
]:
if key in bindings:
bindings[key] = tuple(
sorted(
removeprefix(value, "http://www.wikidata.org/entity/")
for value in bindings[key].split("\t")
)
)
for key in ["uri_format_rdf", URI_FORMAT_KEY]:
if key in bindings:
bindings[key] = tuple(
k for k in bindings[key] if k != "http://purl.obolibrary.org/obo/$1"
)
# remove URNs
bindings["uri_format_rdf"] = [
uri_format_rdf
for uri_format_rdf in bindings.get("uri_format_rdf", [])
if not uri_format_rdf.startswith("urn:")
]
for key, canonicals in [
("database", CANONICAL_DATABASES),
("homepage", CANONICAL_HOMEPAGES),
("uri_format", CANONICAL_URI_FORMATS),
("uri_format_rdf", CANONICAL_RDF_URI_FORMATS),
]:
# sort by increasing length - the assumption being that the shortest
# one has the least amount of nonsense, like language tags or extra
# parameters
values = sorted(bindings.get(key, []), key=len)
if not values:
pass
elif len(values) == 1:
bindings[key] = values[0]
elif prefix not in canonicals:
logger.warning(
"[wikidata] need to curate canonical %s for %s (%s):",
key,
prefix,
bindings["name"],
)
for value in values:
logger.warning(" %s", value)
bindings[key] = values[0]
else:
bindings[key] = canonicals[prefix]
pattern = bindings.get("pattern")
if pattern:
if not pattern.startswith("^"):
pattern = "^" + pattern
if not pattern.endswith("$"):
pattern = pattern + "$"
bindings["pattern"] = pattern
rv[prefix] = {k: v for k, v in bindings.items() if k and v}
return rv
[docs]
def get_wikidata(force_download: bool = False) -> dict[str, dict[str, Any]]:
"""Get the wikidata registry."""
if PROCESSED_PATH.exists() and not force_download:
return load_processed(PROCESSED_PATH)
data = _get_wikidata()
with PROCESSED_PATH.open("w") as file:
json.dump(data, file, indent=2, sort_keys=True)
return data
# Unlike the other aligners, the wikidata one doesn't really do the job of making the alignment.
# It's more of a stand-in and curation sheet generator right now.
class WikidataAligner(Aligner):
"""Aligner for Wikidata properties."""
key = "wikidata"
getter = get_wikidata
curation_header: ClassVar[Sequence[str]] = (
"name",
"homepage",
"description",
"uri_format",
"example",
)
def get_skip(self) -> Mapping[str, str]:
"""Get entries to skip."""
return SKIP
if __name__ == "__main__":
WikidataAligner.cli()