"""Functionality for parsing IRIs."""
from __future__ import annotations
from functools import lru_cache
from typing import Literal, overload
import curies
from curies import ReferenceTuple
from .constants import FailureReturnType, MaybeCURIE, NonePair, get_failure_return_type
from .resource_manager import manager
# Public API of this module: the names exported by ``from <module> import *``.
# Every entry corresponds to a function defined below.
__all__ = [
    "curie_from_iri",
    "get_default_converter",
    "get_preferred_converter",
    "normalize_curie",
    "normalize_parsed_curie",
    "normalize_prefix",
    "parse_curie",
    "parse_iri",
]
[docs]
def get_default_converter() -> curies.Converter:
    """Return the converter exposed by the module-level default manager.

    :returns: The object stored on ``manager.converter``.
    """
    default_converter: curies.Converter = manager.converter
    return default_converter
[docs]
@lru_cache(maxsize=1)
def get_preferred_converter() -> curies.Converter:
    """Build a converter that favors preferred CURIE prefixes and RDF URI prefixes.

    The result is memoized with :func:`functools.lru_cache`, so the manager is
    only asked to build the converter on the first call.

    :returns: The converter produced by ``manager.get_converter`` with the
        prefix priority ``["preferred", "default"]`` and the URI prefix priority
        ``["rdf", "default"]``.
    """
    prefix_order = ["preferred", "default"]
    uri_prefix_order = ["rdf", "default"]
    return manager.get_converter(
        prefix_priority=prefix_order,
        uri_prefix_priority=uri_prefix_order,
    )
[docs]
def curie_from_iri(
    iri: str,
    *,
    use_preferred: bool = False,
) -> str | None:
    """Generate a CURIE string from an IRI by delegating to :func:`parse_iri`.

    :param iri: A valid IRI
    :param use_preferred: If set to true, uses the "preferred prefix", if available,
        instead of the canonicalized Bioregistry prefix.
    :returns: A CURIE string if the IRI can be parsed, otherwise ``None``.

    >>> curie_from_iri("http://purl.bioontology.org/ontology/NCBITAXON/131567")
    'ncbitaxon:131567'
    """
    reference = parse_iri(
        iri,
        use_preferred=use_preferred,
        on_failure_return_type=FailureReturnType.single,
    )
    # The "single" failure value was requested, so a falsy result means
    # there is no reference to render as a CURIE.
    return reference.curie if reference else None
# Typing stubs for ``parse_iri``. They only refine the declared return type
# based on the literal values of ``strict`` and ``on_failure_return_type``;
# the runtime behavior lives in the undecorated definition that follows.

# Stub 1 -- ``strict=True``: declared to always return a ReferenceTuple.
# NOTE(review): ``strict`` has a default in this stub, while the implementation
# defaults to ``strict=False``. A call that omits ``strict`` can therefore match
# this first stub and be typed as never returning a failure value -- confirm
# whether ``strict`` should be a required keyword here.
# docstr-coverage:excused `overload`
@overload
def parse_iri(
    iri: str,
    *,
    use_preferred: bool = ...,
    strict: Literal[True] = ...,
    on_failure_return_type: FailureReturnType = ...,
) -> ReferenceTuple: ...
# Stub 2 -- non-strict with the "pair" failure type (the default): declared to
# return either a ReferenceTuple or a NonePair.
# docstr-coverage:excused `overload`
@overload
def parse_iri(
    iri: str,
    *,
    use_preferred: bool = ...,
    strict: Literal[False] = ...,
    on_failure_return_type: Literal[FailureReturnType.pair] = FailureReturnType.pair,
) -> ReferenceTuple | NonePair: ...
# Stub 3 -- non-strict with the "single" failure type passed explicitly:
# declared to return either a ReferenceTuple or None.
# docstr-coverage:excused `overload`
@overload
def parse_iri(
    iri: str,
    *,
    use_preferred: bool = ...,
    strict: Literal[False] = ...,
    on_failure_return_type: Literal[FailureReturnType.single],
) -> ReferenceTuple | None: ...
[docs]
def parse_iri(
    iri: str,
    *,
    use_preferred: bool = False,
    strict: bool = False,
    on_failure_return_type: FailureReturnType = FailureReturnType.pair,
) -> ReferenceTuple | NonePair | None:
    """Parse an IRI into a prefix/identifier reference.

    The work is done by ``parse_uri`` on the converter returned from
    :func:`get_default_converter`.

    :param iri: A valid IRI
    :param use_preferred: If set to true, uses the "preferred prefix", if available,
        instead of the canonicalized Bioregistry prefix.
    :param strict: If set to true, requires parsing to succeed
    :param on_failure_return_type: whether to return a single None or a pair of None's
    :returns: A pair of prefix/identifier, if it can be parsed; otherwise the
        failure value selected by ``on_failure_return_type``.
    :raises TypeError: if an invalid on_failure_return_type is given
    """
    converter = get_default_converter()
    reference: ReferenceTuple | None = converter.parse_uri(iri, strict=strict)  # type:ignore[call-overload]
    if reference is None:
        return get_failure_return_type(on_failure_return_type)
    if use_preferred:
        # the manager is only consulted when a preferred prefix was requested
        return manager.make_preferred(reference, use_preferred=True)
    return reference
# Typing stubs for ``normalize_curie``: the declared return type depends on the
# literal value of ``strict``.

# Stub 1 -- ``strict=True``: declared to always return a string.
# NOTE(review): ``strict`` defaults to ``True`` in this stub, while the
# implementation defaults to ``strict=False``. A call that omits ``strict`` can
# match this first stub and be typed as ``str`` instead of ``str | None`` --
# confirm whether ``strict`` should be a required keyword here.
# docstr-coverage:excused `overload`
@overload
def normalize_curie(
    curie: str,
    *,
    sep: str = ...,
    use_preferred: bool = ...,
    strict: Literal[True] = True,
) -> str: ...
# Stub 2 -- ``strict=False``: the result may be None.
# docstr-coverage:excused `overload`
@overload
def normalize_curie(
    curie: str,
    *,
    sep: str = ...,
    use_preferred: bool = ...,
    strict: Literal[False] = False,
) -> str | None: ...
[docs]
def normalize_curie(
    curie: str,
    *,
    sep: str = ":",
    use_preferred: bool = False,
    strict: bool = False,
) -> str | None:
    """Standardize a CURIE by delegating to the default manager.

    :param curie: A compact URI (CURIE) in the form of <prefix:identifier>
    :param sep: The separator for the CURIE. Defaults to the colon ":" however the slash
        "/" is sometimes used in Identifiers.org and the underscore "_" is used for OBO
        PURLs.
    :param use_preferred: If set to true, uses the "preferred prefix", if available,
        instead of the canonicalized Bioregistry prefix.
    :param strict: If true, raises an error if the prefix can't be standardized
    :returns: A normalized CURIE, if possible using the colon as a separator

    >>> normalize_curie("pdb:1234")
    'pdb:1234'

    Fix commonly mistaken prefix

    >>> normalize_curie("pubchem:1234")
    'pubchem.compound:1234'

    Address banana problem

    >>> normalize_curie("GO:GO:1234")
    'go:1234'
    >>> normalize_curie("go:GO:1234")
    'go:1234'
    >>> normalize_curie("go:go:1234")
    'go:1234'
    >>> normalize_curie("go:1234")
    'go:1234'

    Address banana problem with OBO banana

    >>> normalize_curie("fbbt:FBbt:1234")
    'fbbt:1234'
    >>> normalize_curie("fbbt:fbbt:1234")
    'fbbt:1234'
    >>> normalize_curie("fbbt:1234")
    'fbbt:1234'

    Address banana problem with explicit banana

    >>> normalize_curie("go.ref:GO_REF:1234")
    'go.ref:1234'
    >>> normalize_curie("go.ref:1234")
    'go.ref:1234'

    Parse OBO PURL curies

    >>> normalize_curie("GO_1234", sep="_")
    'go:1234'

    Use preferred

    >>> normalize_curie("GO_1234", sep="_", use_preferred=True)
    'GO:1234'
    """
    # The flag is forwarded as a literal rather than as the variable itself
    # (presumably so the manager's typed signatures resolve; confirm there).
    if not strict:
        return manager.normalize_curie(curie, sep=sep, use_preferred=use_preferred, strict=False)
    return manager.normalize_curie(curie, sep=sep, use_preferred=use_preferred, strict=True)
# Typing stubs for ``normalize_parsed_curie``: the declared return type depends
# on the literal value of ``strict``.

# Stub 1 -- ``strict=True``: declared to always return a ReferenceTuple.
# NOTE(review): ``strict`` defaults to ``True`` in this stub, while the
# implementation defaults to ``strict=False``. A call that omits ``strict`` can
# match this first stub and be typed without the NonePair alternative --
# confirm whether ``strict`` should be a required keyword here.
# docstr-coverage:excused `overload`
@overload
def normalize_parsed_curie(
    prefix: str,
    identifier: str,
    *,
    use_preferred: bool = ...,
    strict: Literal[True] = True,
) -> ReferenceTuple: ...
# Stub 2 -- ``strict=False``: the result may be a NonePair.
# docstr-coverage:excused `overload`
@overload
def normalize_parsed_curie(
    prefix: str,
    identifier: str,
    *,
    use_preferred: bool = ...,
    strict: Literal[False] = False,
) -> ReferenceTuple | NonePair: ...
[docs]
def normalize_parsed_curie(
    prefix: str,
    identifier: str,
    *,
    use_preferred: bool = False,
    strict: bool = False,
) -> ReferenceTuple | NonePair:
    """Standardize an already-split prefix/identifier pair via the default manager.

    :param prefix: The prefix in the CURIE
    :param identifier: The identifier in the CURIE
    :param use_preferred: If set to true, uses the "preferred prefix", if available,
        instead of the canonicalized Bioregistry prefix.
    :param strict: If true, raises an error if the prefix can't be standardized
    :returns: A normalized prefix/identifier pair, conforming to Bioregistry standards.
        This means no redundant prefixes or bananas, all lowercase.
    """
    if not strict:
        # Non-strict calls always request the "pair" failure value, matching
        # this function's declared ``NonePair`` return alternative.
        return manager.normalize_parsed_curie(
            prefix,
            identifier,
            use_preferred=use_preferred,
            on_failure_return_type=FailureReturnType.pair,
            strict=strict,
        )
    return manager.normalize_parsed_curie(
        prefix,
        identifier,
        use_preferred=use_preferred,
        strict=strict,
    )
# Typing stubs for ``normalize_prefix``: the declared return type depends on the
# literal value of ``strict``.

# Stub 1 -- ``strict=True``: declared to always return a string.
# NOTE(review): ``strict`` defaults to ``True`` in this stub, while the
# implementation defaults to ``strict=False``. A call that omits ``strict`` can
# match this first stub and be typed as ``str`` instead of ``str | None`` --
# confirm whether ``strict`` should be a required keyword here.
# docstr-coverage:excused `overload`
@overload
def normalize_prefix(
    prefix: str, *, use_preferred: bool = False, strict: Literal[True] = True
) -> str: ...
# Stub 2 -- ``strict=False``: the result may be None.
# docstr-coverage:excused `overload`
@overload
def normalize_prefix(
    prefix: str, *, use_preferred: bool = False, strict: Literal[False] = False
) -> str | None: ...
[docs]
def normalize_prefix(
    prefix: str, *, use_preferred: bool = False, strict: bool = False
) -> str | None:
    """Look up the normalized form of a prefix, or return None if not registered.

    :param prefix: The prefix to normalize, which could come from Bioregistry, OBO
        Foundry, OLS, or any of the curated synonyms in the Bioregistry
    :param strict: If true and the prefix could not be looked up, raises an error
    :param use_preferred: If set to true, uses the "preferred prefix", if available,
        instead of the canonicalized Bioregistry prefix.
    :returns: The canonical Bioregistry prefix, if it could be looked up. This will
        usually take precedence: MIRIAM, OBO Foundry / OLS, Custom except in a few
        cases, such as NCBITaxon.

    This works for synonym prefixes, like:

    >>> assert "ncbitaxon" == normalize_prefix("taxonomy")

    This works for common mistaken prefixes, like:

    >>> assert "pubchem.compound" == normalize_prefix("pubchem")

    This works for prefixes that are often written many ways, like:

    >>> assert "ec" == normalize_prefix("ec-code")
    >>> assert "ec" == normalize_prefix("EC_CODE")

    Get a "preferred" prefix:

    >>> normalize_prefix("go", use_preferred=True)
    'GO'
    """
    # The flag is forwarded as a literal rather than as the variable itself
    # (presumably so the manager's typed signatures resolve; confirm there).
    if not strict:
        return manager.normalize_prefix(prefix, use_preferred=use_preferred, strict=False)
    return manager.normalize_prefix(prefix, use_preferred=use_preferred, strict=True)
# Typing stubs for ``parse_curie``. They only refine the declared return type
# based on the literal values of ``strict`` and ``on_failure_return_type``.

# Stub 1 -- ``strict=True``: declared to always return a ReferenceTuple.
# NOTE(review): ``strict`` defaults to ``True`` in this stub, while the
# implementation defaults to ``strict=False``. A call that omits ``strict`` can
# match this first stub and be typed as never returning a failure value --
# confirm whether ``strict`` should be a required keyword here.
# docstr-coverage:excused `overload`
@overload
def parse_curie(
    curie: str,
    *,
    sep: str = ...,
    use_preferred: bool = ...,
    on_failure_return_type: FailureReturnType = ...,
    strict: Literal[True] = True,
) -> ReferenceTuple: ...
# Stub 2 -- non-strict with the "single" failure type passed explicitly:
# declared to return either a ReferenceTuple or None.
# docstr-coverage:excused `overload`
@overload
def parse_curie(
    curie: str,
    *,
    sep: str = ...,
    use_preferred: bool = ...,
    on_failure_return_type: Literal[FailureReturnType.single],
    strict: Literal[False] = False,
) -> ReferenceTuple | None: ...
# Stub 3 -- non-strict with the "pair" failure type (the default): declared to
# return either a ReferenceTuple or a NonePair.
# docstr-coverage:excused `overload`
@overload
def parse_curie(
    curie: str,
    *,
    sep: str = ...,
    use_preferred: bool = ...,
    on_failure_return_type: Literal[FailureReturnType.pair] = FailureReturnType.pair,
    strict: Literal[False] = False,
) -> ReferenceTuple | NonePair: ...
[docs]
def parse_curie(
    curie: str,
    *,
    sep: str = ":",
    use_preferred: bool = False,
    on_failure_return_type: FailureReturnType = FailureReturnType.pair,
    strict: bool = False,
) -> MaybeCURIE:
    """Parse a CURIE, normalizing the prefix and identifier if necessary.

    :param curie: A compact URI (CURIE) in the form of <prefix:identifier>
    :param sep: The separator for the CURIE. Defaults to the colon ":" however the slash
        "/" is sometimes used in Identifiers.org and the underscore "_" is used for OBO
        PURLs.
    :param use_preferred: If set to true, uses the "preferred prefix", if available,
        instead of the canonicalized Bioregistry prefix.
    :param on_failure_return_type: whether to return a single None or a pair of None's
    :param strict: If set to true, requires parsing to succeed. In that case the
        request is forwarded to the manager in strict mode and
        ``on_failure_return_type`` is not consulted.
    :returns: A tuple of the prefix, identifier, if parsable
    :raises TypeError: If ``strict`` is false and an invalid on_failure_return_type
        is given

    The algorithm for parsing a CURIE is very simple: it splits the string on the
    leftmost occurrence of the separator (usually a colon ":" unless specified
    otherwise). The left part is the prefix, and the right part is the identifier.

    >>> parse_curie("pdb:1234")
    ReferenceTuple(prefix='pdb', identifier='1234')

    Address banana problem

    >>> parse_curie("go:GO:1234")
    ReferenceTuple(prefix='go', identifier='1234')
    >>> parse_curie("go:go:1234")
    ReferenceTuple(prefix='go', identifier='1234')
    >>> parse_curie("go:1234")
    ReferenceTuple(prefix='go', identifier='1234')

    Address banana problem with OBO banana

    >>> parse_curie("fbbt:FBbt:1234")
    ReferenceTuple(prefix='fbbt', identifier='1234')
    >>> parse_curie("fbbt:fbbt:1234")
    ReferenceTuple(prefix='fbbt', identifier='1234')
    >>> parse_curie("fbbt:1234")
    ReferenceTuple(prefix='fbbt', identifier='1234')

    Address banana problem with explicit banana

    >>> parse_curie("go.ref:GO_REF:1234")
    ReferenceTuple(prefix='go.ref', identifier='1234')
    >>> parse_curie("go.ref:1234")
    ReferenceTuple(prefix='go.ref', identifier='1234')

    Parse OBO PURL curies

    >>> parse_curie("GO_1234", sep="_")
    ReferenceTuple(prefix='go', identifier='1234')

    Banana with no peel

    >>> parse_curie("omim.ps:PS12345")
    ReferenceTuple(prefix='omim.ps', identifier='12345')

    Use preferred (available)

    >>> parse_curie("GO_1234", sep="_", use_preferred=True)
    ReferenceTuple(prefix='GO', identifier='1234')

    Use preferred (unavailable)

    >>> parse_curie("pdb:1234", use_preferred=True)
    ReferenceTuple(prefix='pdb', identifier='1234')
    """
    if strict:
        # Strict mode: the failure return type is deliberately not forwarded.
        return manager.parse_curie(
            curie,
            sep=sep,
            use_preferred=use_preferred,
            strict=strict,
        )
    # The two non-strict branches make the same call; they are kept separate so
    # each one is reached with a single known failure type (presumably for the
    # manager's typed signatures; confirm there).
    elif on_failure_return_type == FailureReturnType.single:
        return manager.parse_curie(
            curie,
            sep=sep,
            use_preferred=use_preferred,
            on_failure_return_type=on_failure_return_type,
            strict=strict,
        )
    elif on_failure_return_type == FailureReturnType.pair:
        return manager.parse_curie(
            curie,
            sep=sep,
            use_preferred=use_preferred,
            on_failure_return_type=on_failure_return_type,
            strict=strict,
        )
    else:
        # Same exception type as before, now with a message naming the bad value.
        raise TypeError(f"invalid on_failure_return_type: {on_failure_return_type!r}")