NLP | Likely Word Tags

Code #1: Create the function

# Loading libraries
from nltk.probability import FreqDist, ConditionalFreqDist


# Create a function that maps the most frequent words to their most likely tag
def word_tag_model(words, tagged_words, limit=200):

    # Frequency distribution of the words and a conditional
    # frequency distribution of (word, tag) pairs
    fd = FreqDist(words)
    cfd = ConditionalFreqDist(tagged_words)

    # Take the `limit` most frequent words
    most_freq = (word for word, count in fd.most_common(limit))

    # Map each of those words to its single most common tag
    return dict((word, cfd[word].max())
                for word in most_freq)
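Before moving on, it helps to see the kind of mapping this function produces. The sketch below runs it on a tiny hand-made word list (the data is illustrative, not from any corpus), and ties among equally frequent words may be ordered differently on your machine.

# Minimal sketch with illustrative data (not from the treebank corpus)
words = ['the', 'cat', 'sat', 'on', 'the', 'mat', 'the', 'dog']
tagged_words = [('the', 'DT'), ('cat', 'NN'), ('sat', 'VBD'),
                ('on', 'IN'), ('the', 'DT'), ('mat', 'NN'),
                ('the', 'DT'), ('dog', 'NN')]

# 'the' is the most frequent word and maps to its most common tag 'DT';
# the remaining slots go to whichever singletons win the frequency tie.
print(word_tag_model(words, tagged_words, limit=3))
# e.g. {'the': 'DT', 'cat': 'NN', 'sat': 'VBD'}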

Code #2: Using the function with UnigramTagger

# Loading libraries
from tag_util import word_tag_model
from nltk.corpus import treebank
from nltk.tag import UnigramTagger

# Initialize the training and testing sets
train_data = treebank.tagged_sents()[:3000]
test_data = treebank.tagged_sents()[3000:]

# Build the likely-tag model from the treebank corpus
model = word_tag_model(treebank.words(),
                       treebank.tagged_words())

# Initialize a UnigramTagger from the model alone (no training sentences)
tag = UnigramTagger(model=model)

print("Accuracy:", tag.evaluate(test_data))

Output:

 Accuracy: 0.559680552557738 
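The accuracy is modest because the tagger only knows the 200 most frequent treebank words; every other token falls through untagged. A quick sanity check on an illustrative sentence (the exact tags depend on your treebank copy) makes this visible:

# Words inside the 200-word model receive a tag; everything else is None
print(tag.tag(['the', 'company', 'announced', 'quarterly', 'earnings']))
# Frequent words such as 'the' come back tagged (typically 'DT');
# rarer words fall through as None.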

Code #3: Using the likely-tag tagger as a backoff

# Loading libraries
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger
from nltk.tag import DefaultTagger
from tag_util import backoff_tagger

# Fall back to the noun tag when nothing else applies
default_tagger = DefaultTagger('NN')

# Likely-tag tagger that falls back to the default tagger
likely_tagger = UnigramTagger(
    model=model, backoff=default_tagger)

# Train a unigram/bigram/trigram chain with the likely tagger as its backoff
tag = backoff_tagger(train_data, [
    UnigramTagger, BigramTagger,
    TrigramTagger], backoff=likely_tagger)

print("Accuracy:", tag.evaluate(test_data))

Output:

 Accuracy: 0.8806820634578028 
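The backoff_tagger helper used above comes from the same tag_util module as word_tag_model. If that helper is not available, the idea fits in a few lines: each tagger class in the list is trained on the sentences and chained behind the previously built tagger as its backoff. The sketch below captures that idea; it is not necessarily identical to the tag_util version.

# Minimal sketch of a backoff_tagger helper (assumed equivalent to tag_util's)
def backoff_tagger(train_sents, tagger_classes, backoff=None):
    # Train each class in turn, chaining it onto the previous tagger
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff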

Note: the backoff chain improves accuracy. We can improve this result further by using the UnigramTagger class more effectively, as Code #4 shows.

Code #4: Manually overriding the trained taggers

# Loading libraries
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger
from nltk.tag import DefaultTagger
from tag_util import backoff_tagger

# Fall back to the noun tag when nothing else applies
default_tagger = DefaultTagger('NN')

# Train the unigram/bigram/trigram chain with the default tagger as its backoff
tagger = backoff_tagger(train_data, [
    UnigramTagger, BigramTagger,
    TrigramTagger], backoff=default_tagger)

# Put the likely-tag model in front so its tags override the trained chain
likely_tag = UnigramTagger(model=model, backoff=tagger)

print("Accuracy:", likely_tag.evaluate(test_data))

Output:

 Accuracy: 0.8824088063889488
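As a final check, the chained tagger no longer leaves tokens untagged: anything the likely-tag model misses is handled by the trained chain, and anything the chain misses falls back to the 'NN' default. The sentence below is illustrative; the exact tags depend on the trained model.

# Every token now receives some tag thanks to the backoff chain
print(likely_tag.tag(['the', 'company', 'announced', 'quarterly', 'earnings']))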