# Source code for finntk.omor.extract

"""
Functions for extracting lemmas from OMorFi analyses.
"""
from .inst import get_omorfi
from .anlys import (
    analysis_to_subword_dicts,
    default_lemmatise,
    lemmas_of_subword_dicts,
    ext_lemma_feats,
    true_lemmatise,
)


def contig_slices(elems):
    """
    Yield every non-empty contiguous run of `elems` as a list.

    `elems` may be any iterable; it is materialised into a list first. Runs
    are produced grouped by start position, and for a given start in order of
    increasing length.
    """
    items = [*elems]
    stop_limit = len(items) + 1
    for lo, _ in enumerate(items):
        # All runs beginning at position `lo`, shortest first.
        yield from (items[lo:hi] for hi in range(lo + 1, stop_limit))


def iden_func(x):
    """Identity function: hand back `x` untouched (the default `norm_func`)."""
    return x


def _extract_lemmas(
    word_form,
    get_slices,
    lemmatise_func=default_lemmatise,
    norm_func=iden_func,
    return_feats=False,
    return_pos=False,
):
    """
    Shared driver behind the public lemma extraction helpers.

    `word_form` is analysed with OMorFi; out-of-vocabulary analyses are
    skipped. Each remaining analysis is turned into subword dicts, and every
    slice yielded by `get_slices(subword_dicts)` is passed to
    `lemmas_of_subword_dicts` together with `lemmatise_func`. Each resulting
    lemma is passed through `norm_func` before being stored.

    Returns a set of normalised lemmas when `return_feats` is false.
    Otherwise returns a dict which is filled in through `ext_lemma_feats`;
    when `return_pos` is also set, each feature is paired with the slice's
    UPOS as `(upos, feat)` before being handed over.

    `return_pos` may only be set together with `return_feats`.
    """
    # return_pos => return_feats
    assert not return_pos or return_feats
    collected = {} if return_feats else set()
    for analysis in get_omorfi().analyse(word_form):
        if analysis.is_oov():
            continue
        subword_dicts = analysis_to_subword_dicts(analysis.raw)
        for piece in get_slices(subword_dicts):
            extracted = lemmas_of_subword_dicts(
                piece,
                lemmatise_func=lemmatise_func,
                return_feats=return_feats,
                return_pos=return_pos,
            )
            if return_pos:
                # With return_pos the helper hands back (lemma -> feats, upos).
                by_lemma, upos = extracted
                for lemma, feats in by_lemma.items():
                    key = norm_func(lemma)
                    tagged = ((upos, feat) for feat in feats)
                    ext_lemma_feats(collected, key, tagged)
            elif return_feats:
                for lemma, feats in extracted.items():
                    key = norm_func(lemma)
                    ext_lemma_feats(collected, key, feats)
            else:
                collected.update(norm_func(lemma) for lemma in extracted)
    return collected


def extract_lemmas(word_form):
    """
    Extract lemmas specifically mentioned by OMorFi.

    Every subword of every analysis is lemmatised on its own: each subword
    dict is handed to `_extract_lemmas` as a separate one-element slice.
    """
    return _extract_lemmas(
        word_form, lambda analysis_dicts: [[d] for d in analysis_dicts]
    )


def extract_lemmas_span(word_form):
    """
    Works like `extract_lemmas`, but doesn't extract individual subwords.
    However, if a word is only recognised as a compound word by OMorFi it
    will glue the parts together, lemmatising only the last subword. This
    means it extracts only lemmas which span the whole word form.
    """
    # A single slice holding all subword dicts of the analysis.
    return _extract_lemmas(word_form, lambda analysis_dicts: [analysis_dicts])


def extract_true_lemmas_span(word_form, norm_func=iden_func, return_pos=False):
    """
    Works like `extract_lemmas_span`, but uses `true_lemmatise`. It also
    returns some of the features associated with each lemma.

    `norm_func` is applied to every lemma before it is stored, and
    `return_pos` is forwarded to `_extract_lemmas` (features are always
    requested, so the result is a dict rather than a set).
    """
    return _extract_lemmas(
        word_form,
        lambda analysis_dicts: [analysis_dicts],
        lemmatise_func=true_lemmatise,
        norm_func=norm_func,
        return_feats=True,
        return_pos=return_pos,
    )


def extract_lemmas_combs(word_form):
    """
    Works like `extract_lemmas`, but also tries to combine adjacent subwords
    to make lemmas which may be out of vocabulary for OMorFi.

    Note that this will overgenerate (by design). For example: voileipäkakku
    will generate voi, voileipä and voileipäkakku as desired, but will also
    spuriously generate leipäkakku.
    """
    # contig_slices yields every contiguous run of subwords.
    return _extract_lemmas(word_form, contig_slices)


def extract_lemmas_recurs(word_form):
    """
    Works like `extract_lemmas`, but also tries to expand each lemma into
    more lemmas. This helps in some cases (but can overgenerate even more).
    For example, it will mean that synnyinkaupunkini will generate synty,
    kaupunki, synnyinkaupunki, synnyin and syntyä.

    Returns the set of all lemmas reached by repeatedly applying
    `extract_lemmas_combs` to each newly found lemma.
    """
    expand_queue = [word_form]
    res = set()
    while expand_queue:
        new_lemmas = extract_lemmas_combs(expand_queue.pop())
        # Only lemmas not seen before are expanded further, so the loop ends
        # once no expansion produces anything new.
        novel_lemmas = new_lemmas - res
        expand_queue.extend(novel_lemmas)
        res |= novel_lemmas
    return res


def lemma_intersect(toks1, toks2):
    """
    Given two iterables of tokens, return the intersection of their lemmas.
    This can work as a simple, high recall, method of matching for example,
    two inflected noun phrases.

    Returns a list with one set of shared lemmas per token position, or
    `None` if the inputs differ in length or if some pair of tokens has no
    lemma in common.
    """
    # Materialise the inputs so generators and other unsized iterables can
    # be length-checked as well.
    toks1 = list(toks1)
    toks2 = list(toks2)
    if len(toks1) != len(toks2):
        return None
    res = []
    for t1, t2 in zip(toks1, toks2):
        inter = extract_lemmas_span(t1) & extract_lemmas_span(t2)
        if not inter:
            return None
        res.append(inter)
    return res