API¶

`underthesea` Package¶

underthesea.info(version)[source]¶

Show information about underthesea package

Parameters:	version (str) – version of package

class underthesea.underthesea.Corpus[source]¶

Corpus is a fundamental resource of NLP

load(folder)[source]¶

save(folder)[source]¶

`corpus` Package¶

class underthesea.corpus.corpus.Corpus[source]¶: Interface for corpus

class underthesea.corpus.readers.dictionary_loader.DictionaryLoader(filepath)[source]¶

words¶

class underthesea.corpus.plaintext.PlainTextCorpus[source]¶

Bases: underthesea.corpus.corpus.Corpus

class for handling plain text corpus

load(folder)[source]¶

load plaintext folder to documents and apply unicode transformer

Parameters:	folder (string) – path to directory

save(folder)[source]¶

save corpus to files

Parameters:	folder (string) – path to directory

class underthesea.corpus.document.Document(id)[source]¶

set_content(content)[source]¶

set_sentences(sentences)[source]¶

`transformer` Package¶

`word_sent` Package¶

underthesea.word_sent.tokenize(sentence)¶

tokenize a sentence

Parameters:	sentence – raw text input
Returns:	tokenized text
Return type:	unicode\|str

>>> # -*- coding: utf-8 -*-
>>> from underthesea.word_sent.tokenize import tokenize
>>> text = u"Đám cháy bùng phát trưa nay, 7/4, tại khu nhà tôn ngay gần tòa nhà Keangnam, đường Phạm Hùng. Ngọn lửa cùng khói đen bốc lên dữ dội làm đen kịt một góc không gian. Giao thông quanh khu vực bị ảnh hưởng, trong đó đường trên cao bị tắc một đoạn khá dài..."

>>> tokenize(text)
u"Đám cháy bùng phát trưa nay , 7 / 4 , tại khu nhà tôn ngay gần tòa nhà Keangnam , đường Phạm Hùng . Ngọn lửa cùng khói đen bốc lên dữ dội làm đen kịt một góc không gian . Giao thông quanh khu vực bị ảnh hưởng , trong đó đường trên cao bị tắc một đoạn khá dài ..."

underthesea.word_sent(sentence)¶

word segmentation

Parameters:	sentence (unicode\|str) – raw sentence
Returns:	segmented sentence
Return type:	unicode\|str

>>> # -*- coding: utf-8 -*-
>>> from underthesea import word_sent
>>> sentence = u"Chúng ta thường nói đến Rau sạch , Rau an toàn để phân biệt với các rau bình thường bán ngoài chợ ."

>>> word_sent(sentence)
[u"Chúng ta", u"thường", u"nói", u"đến", u"Rau sạch", u",", u"Rau", u"an toàn", u"để", u"phân biệt", u"với",
u"các", u"rau", u"bình thường", u"bán", u"ngoài", u"chợ", u"."]

>>> word_sent(sentence, format="text")
u'Chúng_ta thường nói đến Rau_sạch , Rau an_toàn để phân_biệt với các rau bình_thường bán ngoài chợ .'

`pos_tag` Package¶

underthesea.pos_tag(sentence)¶

part of speech tagging

Parameters:	sentence (unicode\|str) – raw sentence
Returns:	tagged sentence
Return type:	list

>>> # -*- coding: utf-8 -*-
>>> from underthesea import pos_tag
>>> text = u"Chợ thịt chó nổi tiếng ở TP HCM bị truy quét"
>>> pos_tag(text)
[(u'Chợ', 'N'),
 (u'thịt', 'N'),
 (u'chó', 'N'),
 (u'nổi tiếng', 'A'),
 (u'ở', 'E'),
 (u'TP HCM', 'Np'),
 (u'bị', 'V'),
 (u'truy quét', 'V')]

`chunking` Package¶

underthesea.chunk(sentence)¶

chunk a sentence into phrases

Parameters:	sentence (unicode) – raw sentence
Returns:	list of tuple with word, pos tag, chunking tag
Return type:	list

>>> # -*- coding: utf-8 -*-
>>> from underthesea import chunk
>>> text = u"Bác sĩ bây giờ có thể thản nhiên báo tin bệnh nhân bị ung thư?"
>>> chunk(text)
[(u'Bác sĩ', 'N', 'B-NP'),
 (u'bây giờ', 'P', 'I-NP'),
 (u'có thể', 'R', 'B-VP'),
 (u'thản nhiên', 'V', 'I-VP'),
 (u'báo tin', 'N', 'B-NP'),
 (u'bệnh nhân', 'N', 'I-NP'),
 (u'bị', 'V', 'B-VP'),
 (u'ung thư', 'N', 'I-VP'),
 (u'?', 'CH', 'O')]

`ner` Package¶

underthesea.ner(sentence)¶

locate and classify named entities in text

Parameters:	sentence (unicode) – raw sentence
Returns:	list of tuple with word, pos tag, chunking tag, ner tag
Return type:	list

>>> # -*- coding: utf-8 -*-
>>> from underthesea import ner
>>> text = u"Chưa tiết lộ lịch trình tới Việt Nam của Tổng thống Mỹ Donald Trump"
>>> ner(text)
[('Chưa', 'R', 'O', 'O'),
 ('tiết lộ', 'V', 'B-VP', 'O'),
 ('lịch trình', 'V', 'B-VP', 'O'),
 ('tới', 'E', 'B-PP', 'O'),
 ('Việt Nam', 'Np', 'B-NP', 'B-LOC'),
 ('của', 'E', 'B-PP', 'O'),
 ('Tổng thống', 'N', 'B-NP', 'O'),
 ('Mỹ', 'Np', 'B-NP', 'B-LOC'),
 ('Donald', 'Np', 'B-NP', 'B-PER'),
 ('Trump', 'Np', 'B-NP', 'I-PER')]

`classify` Package¶

underthesea.classify(text)¶

Text classification

Parameters:	text (unicode) – raw text
Returns:	list of labels
Return type:	list

>>> # -*- coding: utf-8 -*-
>>> from underthesea import classify
>>> classify("HLV đầu tiên ở Premier League bị sa thải sau 4 vòng đấu")
['The thao']
>>> classify("Hội đồng tư vấn kinh doanh Asean vinh danh giải thưởng quốc tế")
['Kinh doanh']
>>> classify("Đánh giá “rạp hát tại gia” Samsung Soundbar Sound+ MS750")
['Vi tinh']

API¶

underthesea Package¶

corpus Package¶

transformer Package¶

word_sent Package¶

pos_tag Package¶

chunking Package¶

ner Package¶

classify Package¶