Source code for underthesea.pipeline.ner

# -*- coding: utf-8 -*-
from underthesea import chunk
from .model_crf import CRFNERPredictor


def ner(sentence, format=None, deep=False):
    """
    Locate and classify named entities in text

    Parameters
    ==========

    sentence: {unicode, str}
        raw sentence
    format:
        forwarded unchanged as the second argument of the CRF predictor's
        ``predict`` call; not used when ``deep`` is true
    deep:
        when false (the default) the sentence is chunked and tagged by the
        CRF predictor; when true the ``nlp`` callable from
        ``.model_transformers`` is used instead

    Returns
    =======
    tokens: list of tuple with word, pos tag, chunking tag, ner tag
        tagged sentence

        With ``deep`` enabled the result is instead the list of items
        returned by ``nlp``, where every item whose ``"word"`` starts with
        ``"##"`` has been folded into the item before it (empty list when
        ``nlp`` returns nothing).

    Examples
    --------

    >>> # -*- coding: utf-8 -*-
    >>> from underthesea import ner
    >>> sentence = "Ông Putin ca ngợi những thành tựu vĩ đại của Liên Xô"
    >>> ner(sentence)
    [('Ông', 'Nc', 'B-NP', 'O'), ('Putin', 'Np', 'B-NP', 'B-PER'), ('ca ngợi', 'V', 'B-VP', 'O'), ('những', 'L', 'B-NP', 'O'), ('thành tựu', 'N', 'B-NP', 'O'), ('vĩ đại', 'A', 'B-AP', 'O'), ('của', 'E', 'B-PP', 'O'), ('Liên Xô', 'Np', 'B-NP', 'B-LOC')]
    """
    if deep:
        # Imported inside the branch, so the import only runs when the deep
        # path is actually requested.
        from .model_transformers import nlp

        predictions = nlp(sentence)
        if len(predictions) == 0:
            return []

        merged = [predictions[0]]
        for piece in predictions[1:]:
            word = piece["word"]
            if not word.startswith("##"):
                merged.append(piece)
                continue
            # "##" marks a continuation piece: glue its text (minus the
            # marker) onto the previous item and extend that item's end.
            # The previous item is updated in place.
            previous = merged[-1]
            previous["word"] = previous["word"] + word[2:]
            previous["end"] = piece["end"]
        return merged

    # Default path: chunk first, then let the CRF predictor tag the result.
    chunked = chunk(sentence)
    tagger = CRFNERPredictor.Instance()
    return tagger.predict(chunked, format)