# Sprach- und literaturwissenschaftliche Fakultät - Korpuslinguistik und Morphologie
#
# nonfringe.py

# fileinput lets us loop over the lines of the input file(s) named on the
# command line (or over stdin if none is given)
import fileinput

def merge(tag, bigtag):
    """Fold a single tag into a (possibly slashed) variation tag.

    ``bigtag`` records every tag seen so far for one position, joined by
    '/' in sorted order.  The returned string is ``bigtag`` extended with
    ``tag`` when ``tag`` is not yet one of its slash-separated members.

    An empty ``bigtag`` simply becomes ``tag``; a ``tag`` that equals
    ``bigtag`` as a whole, or that is already a member, leaves ``bigtag``
    unchanged.
    """
    # nothing recorded yet: the tag itself is the whole record
    if not bigtag:
        return tag

    # identical strings need no splitting (also covers a slashed tag that
    # matches the slashed record exactly)
    if tag == bigtag:
        return bigtag

    members = bigtag.split('/')
    if tag in members:
        return bigtag

    # new variant: add it and keep the members in sorted order
    return '/'.join(sorted(members + [tag]))

def add_to_taglist(tags, taglist):
    """Merge one ' ## '-separated tag sequence into ``taglist`` in place.

    ``taglist`` holds, per ngram position, the merged tag of all tag
    sequences seen so far for a variation ngram; position ``k`` is updated
    with the ``k``-th tag of ``tags`` via ``merge``.  Nothing is returned.
    """
    for position, tag in enumerate(tags.split(' ## ')):
        taglist[position] = merge(tag, taglist[position])

def nonfringe(i, j):
    """Return True when index ``i`` is neither the first nor the last
    position of a sequence of overall length ``j``."""
    return 0 < i < j - 1

def main():
    """Print every nonfringe variation found in the variation-ngram input.

    Lines are read through ``fileinput.input()``.  Each non-blank line is
    split on tabs: the first field is not used here (the original code
    named it ``total``), the second is the ngram with its words separated
    by ' ## ', and the remaining fields are consumed in pairs of which
    only the second member, a ' ## '-separated tag sequence, is used (the
    first member was named ``count``).

    For every position that is not at the edge of the ngram and whose
    merged tag contains a '/', the word, the merged tag and the original
    line are printed, separated by tabs.

    Blank (or whitespace-only) lines are skipped.

    Raises:
        IndexError: if a non-blank line has no ngram field, if a pair is
            missing its tag sequence, or if a tag sequence has more
            entries than the ngram.
    """
    # loop over the input line by line
    for line in fileinput.input():
        line = line.rstrip()

        # a blank line (e.g. a stray empty line at the end of the file)
        # holds no ngram; skip it rather than crash on the missing fields
        if not line:
            continue

        # get the line information; fields[0] is not needed here
        fields = line.split('\t')
        ngramlist = fields[1].split(' ## ')

        # set a taglist to be the right length, but with empty strings
        taglist = [''] * len(ngramlist)

        # store all tag sequences: the fields after the ngram come in
        # pairs and the tag sequence is the second member of each pair.
        # Indexing (rather than slicing) keeps a pair without its tag
        # sequence an IndexError instead of silently dropping it.
        for pair_start in range(2, len(fields), 2):
            add_to_taglist(fields[pair_start + 1], taglist)

        # after a whole line has been processed, iterate over the tags
        for i, tag in enumerate(taglist):
            # 1. '/' in tag checks whether this is a varying tag
            # 2. nonfringe checks whether it's nonfringe
            if '/' in tag and nonfringe(i, len(taglist)):
                # put the varying word & the ambiguity tag at the front
                # of the line & print it out
                print(ngramlist[i] + '\t' + tag + '\t' + line)

# run the extraction only when executed as a script, not when imported
if __name__ == "__main__":
    main()