# Source code for finntk.omor.seg

"""
Functions for basic processing of OMorFi segment labelling style analyses.
"""
import re
from more_itertools import split_at

# Lexer for label-segmented analyses.  The pattern is compiled with
# re.VERBOSE, so the whitespace inside it is purely for layout.  It has
# three alternatives, each with its own named group:
#   seg  - the text between "{" and the next "}"
#   tag  - the text between "[" and the next "]"
#   surf - a run of one or more characters containing neither "[" nor "{"
LABELSEGMENT_RE = r"""
    \{ (?P<seg> [^\}]* ) \} |
    \[ (?P<tag> [^\]]* ) \] |
    (?P<surf> [^\[\{]+ )
"""

# Module-level cache for the compiled pattern; populated on first request.
_labelsegment_lex = None


def get_labelsegment_lex():
    """
    Return the compiled form of LABELSEGMENT_RE.

    The pattern is compiled the first time this function is called and the
    same compiled object is handed back on every later call.
    """
    global _labelsegment_lex
    if _labelsegment_lex is not None:
        return _labelsegment_lex
    _labelsegment_lex = re.compile(LABELSEGMENT_RE, flags=re.VERBOSE)
    return _labelsegment_lex


def labelsegment_to_tokens(labelsegmented):
    """
    Lazily tokenise a label-segmented analysis string.

    Yields one (type, value) tuple per lexer match, in order of appearance.
    The type is the name of the lexer group that matched and the value is
    the text captured by that group.
    """
    pattern = get_labelsegment_lex()
    for hit in pattern.finditer(labelsegmented):
        kind = hit.lastgroup
        yield kind, hit.group(kind)


def tokens_to_labelsegment(tokens):
    """
    Serialise (type, value) tokens back into a label-segmented string.

    "seg" values are wrapped in curly braces, "tag" values in square
    brackets and "surf" values are emitted unchanged; the pieces are
    concatenated in the order given.

    Raises AssertionError if a token has any other type.
    """
    bits = []
    for typ, value in tokens:
        if typ == "seg":
            bits.append("{" + value + "}")
        elif typ == "tag":
            bits.append("[" + value + "]")
        elif typ == "surf":
            bits.append(value)
        else:
            # Raised explicitly rather than via `assert False`: assert
            # statements are removed under `python -O`, which would let an
            # unknown token type be dropped from the output silently.
            raise AssertionError("unknown token type: {!r}".format(typ))
    return "".join(bits)


def tokens_to_subword_tokens(it):
    """
    Split a token stream into sub-words at compound word boundaries.

    A boundary is a token whose type is "seg" and whose value is "wB".
    The splitting itself is delegated to ``more_itertools.split_at``, and
    its result is returned as-is.
    """
    return split_at(
        it,
        lambda token: token[0] == "seg" and token[1] == "wB",
    )


def tokens_to_surf(it):
    """
    Given an iterator of segments as (type, value) tuples, reconstruct the
    surface string.

    Only the values of "surf" tokens contribute; they are concatenated in
    the order they appear and every other token is ignored.
    """
    return "".join(v for (t, v) in it if t == "surf")
def labelsegment_to_subword_tokens(labelsegmented):
    """
    Tokenise a label-segmented string and group the tokens by sub-word.

    The string is first lexed into (type, value) tuples, where type is one
    of "seg", "tag" or "surf". The token stream is then split at compound
    word boundaries, i.e. at ("seg", "wB") tokens.

    Returns an iterator with one group of (type, value) tuples per
    sub-word, as produced by ``more_itertools.split_at``.
    """
    return tokens_to_subword_tokens(labelsegment_to_tokens(labelsegmented))