# fileinput lets us iterate over the lines of the input file (or stdin)
import fileinput
def merge(tag, bigtag):
    """Merge an individual tag into a potentially slashed tag.

    A slashed tag ("bigtag") keeps track of variation: it is the
    '/'-joined, sorted list of every distinct tag merged so far,
    e.g. 'NN/VB'.

    Args:
        tag: the single tag to merge in.
        bigtag: the tag(s) accumulated so far; '' if none yet.

    Returns:
        ``tag`` if ``bigtag`` is empty; ``bigtag`` unchanged if ``tag``
        is empty or already recorded; otherwise a new slashed tag with
        ``tag`` added and the alternatives sorted.
    """
    # nothing accumulated yet: the tag stands on its own
    if not bigtag:
        return tag
    # An empty tag carries no information.  Appending it would produce a
    # bogus slashed tag such as '/NN' and make the position look like it
    # varies, so it is treated as "nothing to add".
    if not tag or tag == bigtag:
        return bigtag
    variants = bigtag.split('/')
    if tag in variants:
        return bigtag
    variants.append(tag)
    return '/'.join(sorted(variants))
def add_to_taglist(tags, taglist):
    # fold one ' ## '-separated tag sequence into taglist, in place:
    # slot k of taglist accumulates (via merge) every tag seen at
    # position k over all tag sequences of a variation ngram
    for position, tag in enumerate(tags.split(' ## ')):
        taglist[position] = merge(tag, taglist[position])
def nonfringe(i, j):
    """Check whether position i is a nonfringe position.

    Args:
        i: zero-based position within the sequence.
        j: overall length of the sequence.

    Returns:
        True if ``i`` is strictly between the first position (0) and
        the last position (``j - 1``); False otherwise.
    """
    return 0 < i < j - 1
def main():
    """Print every nonfringe position of each ngram whose tag varies.

    Input lines come from ``fileinput.input()``.  Each line is
    tab-separated: a total, the ngram, then zero or more
    (count, tags) pairs.  The ngram and each tags field are
    ' ## '-separated sequences.

    For every position that is nonfringe and whose merged tag contains
    '/', one line is printed: the word, the merged tag, and the
    original input line, tab-separated.

    Raises:
        ValueError: if a non-blank line does not consist of a total, an
            ngram and complete (count, tags) pairs.
    """
    for line in fileinput.input():
        line = line.rstrip()
        # a blank line (e.g. at the end of the file) has no fields to read
        if not line:
            continue

        # get the line information: total, ngram, then (count, tags) pairs
        fields = line.split('\t')
        # total + ngram + complete pairs always gives an even field count;
        # an odd count means the ngram or a tags field is missing
        if len(fields) % 2:
            raise ValueError(
                '%s:%d: expected total, ngram and (count, tags) pairs, '
                'got %d field(s)'
                % (fileinput.filename(), fileinput.filelineno(), len(fields)))

        # set a taglist to be the right length, but with empty strings
        ngramlist = fields[1].split(' ## ')
        taglist = [''] * len(ngramlist)

        # store all tag sequences; they sit at every second field from
        # index 3 on (the counts in between are not needed here)
        for tags in fields[3::2]:
            add_to_taglist(tags, taglist)

        # after a whole line has been processed, iterate over the tags
        length = len(taglist)
        for i, tag in enumerate(taglist):
            # 1. '/' in tag checks whether this is a varying tag
            # 2. nonfringe checks whether it's nonfringe
            if '/' in tag and nonfringe(i, length):
                # prepend the varying word & the ambiguity tag to the
                # front of the line & print it out
                print(ngramlist[i] + '\t' + tag + '\t' + line)
# run main() only when this file is executed as a script, not when it
# is imported as a module
if __name__ == "__main__":
    main()