Source code for bioregistry.external.integbio

"""Download the Integbio registry."""

from __future__ import annotations

import json
import logging
from collections.abc import Sequence
from pathlib import Path
from typing import Any, ClassVar

import pandas as pd
import requests
from bs4 import BeautifulSoup

from bioregistry.external.alignment_utils import Aligner

__all__ = [
    "IntegbioAligner",
    "get_integbio",
]

logger = logging.getLogger(__name__)

DIRECTORY = Path(__file__).parent.resolve()
PROCESSED_PATH = DIRECTORY / "processed.json"

SKIP = [
    "Link(s) to Downloadable data",
    "Link(s) to Metadata of downloadable data",
    "Link(s) to Terms of use",
    "Link(s) to How to use",
    "Link(s) to API / SPARQL endopoint",
    "Link to LSDB Archive",
    "Link to MEDALS Database list",
    "Link to TogoTV",
    "Similar databases",
    "Record maintainer",
    "Record source",
    "Acquisition date",
    "Organism(s) covered",
    "Date of creation of this record",
    "Last update date of this record",
    "Country/Region",
    "J-GLOBAL ID",  # about institutions?
    "Contact information of database",  # nice idea, but curation is bad so effectively unusable
]


def get_url() -> str:
    """Scrape the current download URL for Integbio.

    :returns: The URL of the Integbio data download.
    :raises ValueError: if the URL can't be found

    .. warning::

        Integbio deletes its old files, so it's impossible to download an old version of the
        database
    """
    base = "https://catalog.integbio.jp/dbcatalog/en/download"
    download_prefix = "/dbcatalog/files/zip/en_integbio_dbcatalog_ccbysa_"
    download_suffix = "_utf8.csv.zip"

    res = requests.get(base, timeout=15)
    soup = BeautifulSoup(res.text, "html.parser")
    for anchor in soup.find_all("a"):
        href: str | None = anchor.attrs["href"]
        if href is None:
            continue
        if href.startswith(download_prefix) and href.endswith(download_suffix):
            return "https://catalog.integbio.jp" + href
    raise ValueError(f"unable to find Integbio download link on {base}")


def _parse_references(s: str) -> list[str]:
    rv = []
    for part in s.strip().split("||"):
        ref = _parse_reference(part.strip())
        if ref:
            rv.append(ref)
    return rv


def _parse_reference(part: str) -> str | None:
    if "\\" in part:  # it's pubmed followed by equvalent DOI
        pubmed, _doi = part.split("\\")
        return pubmed
    if part.isnumeric():  # it's pubmed
        return part
    if part == "etc.":
        return None
    logger.debug(f"IntegBio unhandled reference part: {part}")
    return None


def _strip_split(s: str | None) -> list[str] | None:
    if pd.isna(s) or not s:
        return None
    return [k.strip() for k in s.strip().split("||")]


def _parse_fairsharing_url(s: str) -> str | None:
    if s.startswith("https://fairsharing.org/10.25504/"):
        return s.removeprefix("https://fairsharing.org/10.25504/")
    elif s.startswith("https://fairsharing.org/"):
        return s.removeprefix("https://fairsharing.org/")
    logger.debug(f"unhandled FAIRsharing: {s}")
    return None



[docs]
def get_integbio(*, force_download: bool = False) -> dict[str, dict[str, Any]]:
    """Get the integbio resource."""
    url = get_url()
    df = pd.read_csv(url)
    df.rename(
        columns={
            "Database ID": "prefix",
            "Database name": "name",
            "Alternative name": "alt_name",
            "Database description": "description",
            "URL": "homepage",
            "Link to FAIRsharing": "fairsharing",
            "Reference(s) - PubMed ID/DOI": "references",
            "Language(s)": "languages",
            "Database maintenance site": "maintainer",
            "Tag - Target": "target_keywords",
            "Tag - Information type": "information_keywords",
            "Operational status": "status",
        },
        inplace=True,
    )
    for key in SKIP:
        del df[key]

    df["fairsharing"] = df["fairsharing"].map(_parse_fairsharing_url, na_action="ignore")
    df = df[df["languages"] != "ja"]  # skip only japanese language database for now
    del df["languages"]
    # df["languages"] = df["languages"].map(_strip_split, na_action="ignore")
    df["target_keywords"] = df["target_keywords"].map(_strip_split, na_action="ignore")
    df["information_keywords"] = df["information_keywords"].map(_strip_split, na_action="ignore")
    df["pubmeds"] = df["references"].map(_parse_references, na_action="ignore")
    df["description"] = df["description"].map(lambda s: s.replace("\r\n", "\n"), na_action="ignore")

    del df["references"]
    # TODO ground database maintenance with ROR?
    rv = {}
    for _, row in df.iterrows():
        rv[row["prefix"].lower()] = {k: v for k, v in row.items() if isinstance(v, (str, list))}
    PROCESSED_PATH.write_text(json.dumps(rv, indent=True, ensure_ascii=False, sort_keys=True))
    return rv



class IntegbioAligner(Aligner):
    """Aligner for the Integbio."""

    key = "integbio"
    alt_key_match = "name"
    getter = get_integbio
    curation_header: ClassVar[Sequence[str]] = ("name", "alt_name", "homepage")


if __name__ == "__main__":
    IntegbioAligner.cli()