WordNet is a lexical database, i.e. a dictionary, for the English language that is specifically designed for natural language processing.
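WordNet can be queried through NLTK's wordnet corpus reader. As a quick illustration (a minimal sketch; the exact Synsets printed depend on the WordNet data installed with NLTK):

from nltk.corpus import wordnet

# Each Synset groups synonymous words and carries a POS label and a definition.
for synset in wordnet.synsets('great')[:3]:
    print(synset.name(), synset.pos(), synset.definition())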
Code #1 : Creating a class to look up words in WordNet.
from nltk.tag import SequentialBackoffTagger
from nltk.corpus import wordnet
from nltk.probability import FreqDist

class WordNetTagger(SequentialBackoffTagger):
    '''
    >>> wt = WordNetTagger()
    >>> wt.tag(['food', 'is', 'great'])
    [('food', 'NN'), ('is', 'VB'), ('great', 'JJ')]
    '''
    def __init__(self, *args, **kwargs):
        SequentialBackoffTagger.__init__(self, *args, **kwargs)

        # Map each WordNet POS label to a treebank tag.
        self.wordnet_tag_map = {
            'n': 'NN',
            's': 'JJ',
            'a': 'JJ',
            'r': 'RB',
            'v': 'VB'
        }

    def choose_tag(self, tokens, index, history):
        word = tokens[index]
        fd = FreqDist()

        # Count the POS of every Synset that contains the word.
        for synset in wordnet.synsets(word):
            fd[synset.pos()] += 1

        # If the word has no Synsets, return None so the backoff
        # tagger (if any) can handle it.
        if not fd:
            return None

        return self.wordnet_tag_map.get(fd.max())
This WordNetTagger class counts the number of times each WordNet POS appears in the Synsets for a word, and then translates the most common POS to a treebank tag using an internal mapping.
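To see the idea behind choose_tag() in isolation, the snippet below (a minimal sketch using the same mapping as Code #1; the word 'great' is only an illustrative choice and the winning tag depends on your WordNet data) counts the WordNet POS of every Synset for one word and maps the most common POS to a treebank tag:

from nltk.corpus import wordnet
from nltk.probability import FreqDist

wordnet_tag_map = {'n': 'NN', 's': 'JJ', 'a': 'JJ', 'r': 'RB', 'v': 'VB'}

# Count how often each WordNet POS appears among the word's Synsets.
fd = FreqDist()
for synset in wordnet.synsets('great'):
    fd[synset.pos()] += 1

print(fd.most_common())                 # counts per WordNet POS, e.g. 'n', 'a', 's'
print(wordnet_tag_map.get(fd.max()))    # treebank tag of the most common POS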
Code #2 : Using a simple WordNetTagger()
from taggers import WordNetTagger
from nltk.corpus import treebank

# Initializing the training and testing sets
train_data = treebank.tagged_sents()[:3000]
test_data = treebank.tagged_sents()[3000:]

# Evaluating the WordNetTagger on its own
wn_tagger = WordNetTagger()
a = wn_tagger.evaluate(test_data)

print("Accuracy of WordNetTagger : ", a)
Output :
Accuracy of WordNetTagger : 0.17914876598160262
The accuracy can be improved by putting the WordNetTagger at the end of an NgramTagger backoff chain, as shown in Code #3.
Code #3 : WordNetTagger class at the end of an NgramTagger backoff chain
from taggers import WordNetTagger
from nltk.corpus import treebank
from tag_util import backoff_tagger
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger

# Initializing the training and testing sets
train_data = treebank.tagged_sents()[:3000]
test_data = treebank.tagged_sents()[3000:]

# The WordNetTagger is the final backoff in the chain
wn_tagger = WordNetTagger()

tagger = backoff_tagger(train_data,
                        [UnigramTagger, BigramTagger, TrigramTagger],
                        backoff = wn_tagger)

a = tagger.evaluate(test_data)
print("Accuracy : ", a)
Output :
Accuracy : 0.8848262464925534
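Note that taggers and tag_util are local modules (they hold the WordNetTagger class from Code #1 and the backoff_tagger() helper); they are not part of NLTK. A minimal sketch of such a backoff_tagger() helper, assuming it simply trains each NgramTagger class with the previously built tagger as its backoff, is:

def backoff_tagger(train_sents, tagger_classes, backoff=None):
    # Train each tagger class in turn, chaining it to the previous backoff.
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff

With this chain, the trained NgramTaggers are consulted first and the WordNetTagger only handles words they cannot tag, which is why the combined accuracy is much higher than the WordNetTagger alone.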