词性标注POS tagging

爱被打了一巴掌 2022-09-22 13:56 203阅读 0赞

什么是词性标注，[Part-of-speech tagging][]

比如下面一段标注过词性的文字，用空格分开后，每个词中“/”前面的是英文单词，后面的是它的词性。

Confidence/NN in/IN the/DT pound/NN is/VBZ widely/RB expected/VBN to/TO take/VB another/DT sharp/JJ dive/NN if/IN trade/NN figures/NNS for/IN September/NNP ,/, due/JJ for/IN release/NN tomorrow/NN ,/, fail/VB to/TO show/VB a/DT substantial/JJ improvement/NN from/IN July/NNP and/CC August/NNP 's/POS near-record/JJ deficits/NNS ./.
    Chancellor/NNP of/IN the/DT Exchequer/NNP Nigel/NNP Lawson/NNP 's/POS restated/VBN commitment/NN to/TO a/DT firm/NN monetary/JJ policy/NN has/VBZ helped/VBN to/TO prevent/VB a/DT freefall/NN in/IN sterling/NN over/IN the/DT past/JJ week/NN ./.

上面的 NN 是名词，IN 是介词或从属连词，DT（determiner）表示限定词，等等。

问题是现在要给一段未标注词性的文字的每个单词标注词性。

HMM、最大熵模型、CRF 都可以完成这一任务。

HMM

用HMM做词性标注和HMM做中文分词类似，也可以看成是序列标注问题

[基于隐马尔可夫模型的有监督词性标注][Link 1]

[HMM在自然语言处理中的应用一：词性标注][HMM]

[词性标注][Link 2]

#coding:utf-8
    import re
    
    from dicts import DefaultDict
    from random import choice
    
    def Dict(**args):
        """Build a plain dictionary out of keyword arguments.

        Every keyword name becomes a key and the value supplied for it becomes
        the associated value, e.g. ``Dict(a=1)`` gives ``{'a': 1}``.  Calling
        it with no arguments gives an empty dictionary.
        """
        mapping = dict(args)
        return mapping
    
    def hmm(training_sentences, reducedtagset):
        """Collect bigram-HMM counts from pre-tagged sentences and build the model.

        Tag-to-tag transition counts, tag-to-word emission counts, word counts
        and tag counts are accumulated over the training data and then handed
        to ``hmmtuple``; whatever ``hmmtuple`` returns is returned unchanged.

        Parameters:
            training_sentences: iterable of strings, one sentence each, made of
                whitespace-separated ``word/TAG`` tokens.  A ``/`` that is
                preceded by a backslash belongs to the word, not the separator.
            reducedtagset: when true, every tag that starts with ``VB``,
                ``NN``, ``JJ`` or ``RB`` is collapsed to that two-letter prefix.

        Returns:
            The result of ``hmmtuple(transitions, emissions, wordcounts,
            tagcounts)``.

        Raises:
            ValueError: if a token does not split into exactly one word and
                one tag.
        """
        # Raw string: "a '/' that is not preceded by a backslash".
        separator = re.compile(r'(?<!\\)/')
        coarse_prefixes = ('VB', 'NN', 'JJ', 'RB')

        # NOTE(review): DefaultDict comes from the project's `dicts` module;
        # the `+= 1` updates below assume missing keys read as the default
        # passed to the constructor -- confirm against that module.
        transitions = DefaultDict(DefaultDict(0))
        emissions = DefaultDict(DefaultDict(0))
        wordcounts = DefaultDict(0)
        tagcounts = DefaultDict(0)

        for line in training_sentences:
            prevtag = '<START>'   # Before each sentence, begin in START state
            tagcounts['<START>'] += 1
            for taggedword in line.split():
                (word, tag) = separator.split(taggedword)

                if reducedtagset:
                    for prefix in coarse_prefixes:
                        if tag.startswith(prefix):
                            tag = prefix
                            break

                transitions[prevtag][tag] += 1
                emissions[tag][word] += 1
                wordcounts[word] += 1
                tagcounts[tag] += 1
                prevtag = tag

        # Diagnostic output: the set of tags seen in training.
        print(list(emissions.keys()))

        return hmmtuple(transitions, emissions, wordcounts, tagcounts)
    
    def hmmtuple(transitions, emissions, wordcounts, tagcounts):
        """Convert raw HMM counts into add-one smoothed probabilities.

        The nested mappings are modified in place and also returned.

        Parameters:
            transitions: mapping ``prevtag -> tag -> count``.
            emissions: mapping ``tag -> word -> count``.
            wordcounts: mapping ``word -> count``.
            tagcounts: mapping ``tag -> count`` (including ``'<START>'``).
            The nested mappings must yield 0 for missing keys, because unseen
            ``(prevtag, tag)`` and ``(tag, word)`` pairs are read below.

        Returns:
            A tuple ``(transitions, emissions, tags)`` where ``tags`` is a list
            of the keys of ``emissions``.
        """
        # At test time we will need estimates for "unknown words" -- the words
        # that never occurred in the training data.  One recommended way to do
        # this is to turn all training words occurring just once into
        # '<UNKNOWN>' and use this as the stand-in for all "unknown words" at
        # test time.  Below we make all the necessary transformations to
        # '<UNKNOWN>'.
        #
        # Snapshots (list(...)) are taken because entries are deleted and added
        # while looping; iterating the live views would fail on Python 3.
        for tag, word_counts in list(emissions.items()):
            for word in list(word_counts.keys()):
                if wordcounts[word] == 1:
                    del emissions[tag][word]
                    emissions[tag]['<UNKNOWN>'] += 1

        # Calculate smoothed conditional probabilities.  `tags` is returned and
        # later indexed by position, so it must be a real list.
        tags = list(emissions.keys())
        words = list(wordcounts.keys())
        n_tags = len(tags)
        n_words = len(wordcounts)

        for prevtag in list(transitions.keys()):
            denominator = tagcounts[prevtag] + n_tags
            for tag in tags:
                transitions[prevtag][tag] = (transitions[prevtag][tag]+1.)/denominator

        # NOTE(review): `words` is taken from `wordcounts`, so the once-only
        # words removed above are re-inserted here with a smoothed value, and
        # '<UNKNOWN>' keeps its raw count unless it is itself a key of
        # `wordcounts`.  Left as in the original; confirm this is intended.
        for tag in tags:
            denominator = tagcounts[tag] + n_words
            for word in words:
                emissions[tag][word] = (emissions[tag][word]+1.)/denominator

        return (transitions, emissions, tags)
    
    def strip_tags(tagged_sentences):
        """Given a list of tagged sentences, return a list of untagged sentences.

        Parameters:
            tagged_sentences: iterable of strings made of whitespace-separated
                ``word/TAG`` tokens.  A ``/`` preceded by a backslash is part
                of the word.

        Returns:
            A list with one string per input sentence.  Every word is followed
            by a single space, so a non-empty sentence ends with a trailing
            space and an empty sentence yields ``''``.
        """
        # Raw string: "a '/' that is not preceded by a backslash".
        separator = re.compile(r'(?<!\\)/')

        untagged_sentences = []
        for taggedsent in tagged_sentences:
            words = [separator.split(taggedword)[0]
                     for taggedword in taggedsent.split()]
            # Keep the historical "word + space" layout, trailing space included.
            untagged_sentences.append(''.join(word + ' ' for word in words))
        return untagged_sentences
    
    def _best_row(probtable, col):
        """Return the first row index holding the largest probability in *col*.

        Falls back to row 0 when no probability in the column is above zero.
        """
        best_prob = 0
        best_index = 0
        for i in range(len(probtable)):
            if probtable[i][col][0] > best_prob:
                best_prob = probtable[i][col][0]
                best_index = i
        return best_index

    def maxsequence(probtable, tags):
        """Given a filled Viterbi probability table, return the most likely
        sequence of POS tags.

        Parameters:
            probtable: table indexed as ``probtable[tag_index][word_index]``,
                each cell being ``[probability, previous_tag_index]``.
            tags: sequence of tag names; ``tags[i]`` names row ``i``.

        Returns:
            A list with one tag per column, in sentence order.  An empty table
            (no rows or no columns) gives an empty list.

        The path is recovered by starting at the best cell of the last column
        and following the stored back-pointers.  If a back-pointer is missing
        (``None``) -- for instance because every probability in that column is
        zero -- the walk restarts from the best cell of the previous column
        instead of failing.
        """
        if not probtable or not probtable[0]:
            return []

        c = len(probtable[0])
        current = _best_row(probtable, c-1)

        # Collect tags back-to-front, then reverse once at the end.
        path = []
        for j in range(c-1, -1, -1):
            path.append(tags[current])
            if j > 0:
                backpointer = probtable[current][j][1]
                if backpointer is None:
                    backpointer = _best_row(probtable, j-1)
                current = backpointer

        path.reverse()
        return path
    
    def _emission_prob(emissions, tag, word):
        """Return the emission value of *word* under *tag*.

        Words that are not a key of ``emissions[tag]`` use the '<UNKNOWN>'
        entry scaled by .0001.
        """
        if word in emissions[tag]:
            return emissions[tag][word]
        return .0001*emissions[tag]['<UNKNOWN>']

    def _rescale_column(probtable, col):
        """Divide the probabilities in column *col* by the column maximum.

        Leaves the column untouched when the table has no rows or the maximum
        is not above zero.
        """
        if not probtable:
            return
        peak = max(row[col][0] for row in probtable)
        if peak > 0:
            for row in probtable:
                row[col][0] /= peak

    def viterbi_tags (untagged_sentences, h):
        """Given a list of untagged sentences, return the most likely sequence of
        POS tags.

        Parameters:
            untagged_sentences: iterable of strings of whitespace-separated
                words.
            h: the model tuple ``(transitions, emissions, tags)``; ``tags``
                must be indexable.

        Returns:
            One flat list holding the predicted tags of all sentences, in
            order.  Sentences without any word contribute no tags.
        """
        transitions = h[0]
        emissions = h[1]
        tags = h[2]
        maxtags = []
        r = len(tags)

        for untaggedsent in untagged_sentences:
            words = untaggedsent.split()
            if not words:
                # Blank line: nothing to tag (and no first word to index).
                continue
            c = len(words)

            # probtable[i][j] = [best probability, back-pointer row index]
            probtable = [[[None, None] for _ in range(c)] for _ in range(r)]

            #Initialize zeroth column of probtable
            for i in range(r):
                tag = tags[i]
                transition = transitions['<START>'][tag]
                emission = _emission_prob(emissions, tag, words[0])
                probtable[i][0][0] = transition*emission
            # Scaling a whole column by one positive constant does not change
            # which path is best, but keeps the products from underflowing to
            # zero on long sentences.
            _rescale_column(probtable, 0)

            #Fill in probtable
            for j in range(1, c):
                word = words[j]
                for i in range(r):
                    tag = tags[i]
                    emission = _emission_prob(emissions, tag, word)

                    maxprob = 0
                    maxtag = None
                    for k in range(r):
                        transition = transitions[tags[k]][tag]
                        prob = probtable[k][j-1][0]*transition*emission

                        if (prob > maxprob):
                            maxprob = prob
                            maxtag = k

                    probtable[i][j][0] = maxprob
                    probtable[i][j][1] = maxtag
                _rescale_column(probtable, j)

            #Find most likely sequence of POS tags of this sentence
            sentmaxtags = maxsequence(probtable, tags)
            maxtags.extend(sentmaxtags)

        #Return most likely sequence of POS tags of all sentences
        return maxtags
    
    def true_tags (tagged_sentences):
        """Given a list of tagged sentences, return the tag sequence.

        Parameters:
            tagged_sentences: iterable of strings made of whitespace-separated
                ``word/TAG`` tokens.  A ``/`` preceded by a backslash is part
                of the word.

        Returns:
            One flat list with the tag of every token, in order.

        Raises:
            IndexError: if a token contains no unescaped ``/``.
        """
        # Raw string: "a '/' that is not preceded by a backslash".
        separator = re.compile(r'(?<!\\)/')

        tags = []
        for sent in tagged_sentences:
            tags.extend(separator.split(word)[1] for word in sent.split())
        return tags
    
    def compare(mytags, truetags, reducedtagset):
        """Return the fraction of predicted tags that equal the reference tags.

        Parameters:
            mytags: sequence of predicted tags.
            truetags: sequence of reference tags, aligned with ``mytags`` by
                position; it must be at least as long as ``mytags``.
            reducedtagset: when true, each reference tag starting with ``VB``,
                ``NN``, ``JJ`` or ``RB`` is collapsed to that prefix before the
                comparison.  The predicted tags are compared as given.

        Returns:
            A float: matches divided by ``len(mytags)``, or ``0.0`` when
            ``mytags`` is empty.

        Raises:
            IndexError: if ``truetags`` is shorter than ``mytags``.
        """
        length = len(mytags)
        if length == 0:
            # Nothing to score; avoid dividing by zero.
            return 0.0

        coarse_prefixes = ('VB', 'NN', 'JJ', 'RB')
        score = 0
        for i, mytag in enumerate(mytags):
            truetag = truetags[i]
            if reducedtagset:
                for prefix in coarse_prefixes:
                    if truetag.startswith(prefix):
                        truetag = prefix
                        break

            if mytag == truetag:
                score += 1

        return 1.*score/length
    
    if __name__ == '__main__':
        # Script entry point: train on the first 90% of 'wsj15-18.pos', then
        # print the tagging accuracy on the remaining 10% and on
        # 'wsj_0159.pos'.  Each line of a file is treated as one sentence.
        with open('wsj15-18.pos') as corpus:
            f = corpus.readlines()

        #90% of data is used for training
        print('90% of data is used for training')
        print('--------------------------------')
        i = int(len(f)*.9)
        h = hmm(f[:i], False)

        # Held-out tail of the training corpus.
        test1 = f[i:]
        v1 = viterbi_tags(strip_tags(test1), h)
        t1 = true_tags(test1)
        c1 = compare(v1, t1, False)
        print(c1)

        # Separate evaluation file.
        with open('wsj_0159.pos') as heldout:
            test2 = heldout.readlines()
        v2 = viterbi_tags(strip_tags(test2), h)
        t2 = true_tags(test2)
        c2 = compare(v2, t2, False)
        print(c2)

[200行Python代码实现感知机词性标注器][200_Python]

[Part-of-speech tagging]: https://en.wikipedia.org/wiki/Part-of-speech_tagging
[Link 1]: http://blog.csdn.net/xum2008/article/details/38309605
[HMM]: http://www.52nlp.cn/hmm-application-in-natural-language-processing-one-part-of-speech-tagging-1
[Link 2]: http://www.hankcs.com/nlp/part-of-speech-tagging.html
[200_Python]: http://www.hankcs.com/nlp/averaged-perceptron-tagger.html