Source code for underthesea.pipeline.pos_tag

# -*- coding: utf-8 -*-
from underthesea import word_tokenize
from underthesea.models.fast_crf_sequence_tagger import FastCRFSequenceTagger
from .model_crf import CRFPOSTagPredictor
from os.path import dirname, join

pos_model_v2 = None


def pos_tag(sentence, format=None, model=None):
    """Tag each word of a Vietnamese sentence with its part of speech.

    The raw sentence is first segmented with ``word_tokenize``; the resulting
    tokens are then labelled either by the default CRF predictor or, when
    ``model == "v2.0"``, by a lazily loaded ``FastCRFSequenceTagger``.

    Parameters
    ----------
    sentence : str
        Raw (untokenized) sentence.
    format : optional
        Forwarded unchanged to ``CRFPOSTagPredictor.predict`` on the default
        path. It is not used when ``model == "v2.0"``.
    model : str, optional
        Pass ``"v2.0"`` to use the ``pos_crf_vlsp2013_20230303`` model stored
        in the ``models`` directory next to this module. Any other value
        (including ``None``) selects the default CRF predictor.

    Returns
    -------
    list of tuple
        ``(word, pos_tag)`` pairs, one per token. On the default path this is
        whatever ``CRFPOSTagPredictor.predict`` returns.

    Examples
    --------
    >>> from underthesea import pos_tag
    >>> sentence = "Chợ thịt chó nổi tiếng ở TPHCM bị truy quét"
    >>> pos_tag(sentence)
    [('Chợ', 'N'), ('thịt', 'N'), ('chó', 'N'), ('nổi tiếng', 'A'), ('ở', 'E'), ('TPHCM', 'Np'), ('bị', 'V'), ('truy quét', 'V')]
    """
    global pos_model_v2

    tokens = word_tokenize(sentence)

    if model != "v2.0":
        crf_model = CRFPOSTagPredictor.Instance()
        return crf_model.predict(tokens, format)

    if pos_model_v2 is None:
        # Build and load into a local first, and publish it to the module
        # global only after load() succeeded. Otherwise a failed load would
        # leave an unloaded tagger cached and later calls would never retry.
        tagger = FastCRFSequenceTagger()
        model_dir = join(
            dirname(__file__), "models", "pos_crf_vlsp2013_20230303"
        )
        tagger.load(model_dir)
        pos_model_v2 = tagger

    # The v2.0 tagger takes one feature list per token; the only feature
    # supplied here is the token itself.
    features = [[token] for token in tokens]
    tags = pos_model_v2.predict(features)

    # The v2.0 model emits prefixed labels (B-N, B-CH, B-V, ...). Strip the
    # two-character prefix only when it is actually present, so that a label
    # without one is returned intact instead of being truncated.
    tags = [tag[2:] if tag[:2] in ("B-", "I-") else tag for tag in tags]
    return list(zip(tokens, tags))