# -*- coding: utf-8 -*-
"""Query, download, and format Wikidata as a registry."""
import json
import logging
from collections import defaultdict
from textwrap import dedent
from bioregistry.constants import EXTERNAL, URI_FORMAT_KEY
from bioregistry.utils import query_wikidata
__all__ = [
"get_wikidata",
]
DIRECTORY = EXTERNAL / "wikidata"
DIRECTORY.mkdir(exist_ok=True, parents=True)
RAW_PATH = DIRECTORY / "raw.json"
PROCESSED_PATH = DIRECTORY / "processed.json"
logger = logging.getLogger(__name__)
HEADER = {
# 'database',
"databaseLabel": "database.label",
"databaseMiriam": "database.miriam",
"databaseHomepage": "database.homepage",
"prop": "prefix",
"propLabel": "name",
"propDescription": "description",
"propMiriam": "miriam",
"propHomepage": "homepage",
"propFormat": URI_FORMAT_KEY,
"propFormatRDF": "format.rdf",
"propPattern": "pattern",
}
def iter_results():
"""Iterate over Wikidata properties connected to biological databases."""
query = dedent(
"""\
SELECT
?database ?databaseLabel ?databaseMiriam ?databaseHomepage
?prop ?propLabel ?propDescription ?propMiriam ?propHomepage ?propFormat ?propFormatRDF ?propPattern
# ?propDatabase ?propDatabaseLabel
WHERE {
?database wdt:P31 wd:Q4117139 .
?database wdt:P1687 ?prop .
OPTIONAL { ?database wdt:P856 ?databaseHomepage } .
OPTIONAL { ?database wdt:P4793 ?databaseMiriam } .
OPTIONAL { ?prop wdt:P4793 ?propMiriam } .
OPTIONAL { ?prop wdt:P1630 ?propFormat } .
OPTIONAL { ?prop wdt:P1921 ?propFormatRDF } .
OPTIONAL { ?prop wdt:P1793 ?propPattern } .
OPTIONAL { ?prop wdt:P1896 ?propHomepage } .
OPTIONAL { ?prop wdt:P1629 ?propDatabase } .
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
ORDER BY DESC(?databaseLabel)
"""
)
# Q4117139 "biological database"
# P31 "instance of"
# P1687 "wikidata property" <- meta property
for bindings in query_wikidata(query):
for url in ["prop", "propDatabase", "database"]:
if url in bindings:
bindings[url]["value"] = bindings[url]["value"].split("/")[-1]
yield {key: value["value"] for key, value in bindings.items()}
# Stuff with miriam IDs that shouldn't
MIRIAM_BLACKLIST = {
"Q51162088",
"Q56221155",
"Q47519952",
"Q106201514",
"Q106201090",
"Q106201991",
"Q106201090",
"Q106201514",
"Q106201904",
"Q106201991",
"Q106832467",
"Q96212863",
"Q106695243",
"Q51162088",
"Q56221155",
"Q47519952",
}
[docs]def get_wikidata(force_download: bool = False):
"""Get the wikidata registry."""
if PROCESSED_PATH.exists() and not force_download:
with PROCESSED_PATH.open() as file:
return json.load(file)
data = list(iter_results())
with RAW_PATH.open("w") as file:
json.dump(data, file, indent=2, sort_keys=True)
agg1 = defaultdict(list)
for record in data:
agg1[record["prop"]].append(record)
agg2 = {key: _aggregate(key, values) for key, values in agg1.items()}
rv = {key: _process(record) for key, record in agg2.items()}
with PROCESSED_PATH.open("w") as file:
json.dump(rv, file, indent=2, sort_keys=True)
return rv
CANONICAL_DATABASES = {
"P6800": "Q87630124", # -> NCBI Genome
"P627": "Q48268", # -> International Union for Conservation of Nature
"P351": "Q1345229", # NCBI Gene
"P4168": "Q112783946", # Immune epitope database
}
def _process(record):
return {HEADER.get(k, k): v for k, v in record.items()}
def _aggregate(prop, records):
databases = {record["database"]: record["databaseLabel"] for record in records}
if len(databases) == 1:
canonical_database = list(databases)[0]
elif prop not in CANONICAL_DATABASES:
logger.warning(f"need to curate which is the canonical database for {prop}: {databases}")
canonical_database = list(databases)[0]
else:
canonical_database = CANONICAL_DATABASES[prop]
records = [
dict(record_tuple)
for record_tuple in sorted(
{
tuple(sorted(record.items()))
for record in records
if record["database"] == canonical_database and record["prop"] == prop
}
)
]
if len(records) == 1:
return records[0]
# Throw my hands up in the air! It's probably just because of different URI formatters.
return records[0]
def _main():
import click
r = get_wikidata(force_download=True)
click.echo(f"Got {len(r)} records")
if __name__ == "__main__":
_main()