Sprach- und literaturwissenschaftliche Fakultät - Korpuslinguistik und Morphologie

clean.py

#! /usr/bin/env python
#-*- coding: utf-8-*-

import sys, os, re
import StringIO



def searchHyphen(str):
    """Return True if the second-to-last character of *str* is a hyphen.

    The caller passes lines that still carry their trailing newline, so the
    character examined is the one directly in front of that terminator.
    Strings shorter than two characters (e.g. an empty line consisting only
    of a newline) yield False instead of raising IndexError.
    """
    # Slicing, unlike indexing, returns '' when the position does not exist.
    return str[-2:-1] == '-'

def replace_all(text, dic):
    """Return *text* with every key of *dic* replaced by its value.

    The replacements are applied one after another in the dictionary's
    iteration order, so a later replacement sees the result of the earlier
    ones.

    Args:
        text: The string to transform.
        dic: Mapping of substring -> replacement string.

    Returns:
        The transformed string; *text* unchanged when *dic* is empty.
    """
    # items() is available on Python 2 and 3 dicts alike, whereas
    # iteritems() exists on Python 2 only.
    for old, new in dic.items():
        text = text.replace(old, new)
    return text
# Replacement table passed to replace_all() in main(): maps characters
# carrying abbreviation marks, ligatures, the long s, letters with a
# superscript e and a few other special signs to plain spellings.
# NOTE(review): several keys look identical here ('ā', 'ē', 'ñ', 'ů');
# they may differ only in Unicode normalisation (precomposed vs. combining
# sequence) - verify the code points before removing any "duplicate".
reps = {'ã':'an', 'ā':'an', 'ñ':'nn', 'n̄':'nn', 'ā':'an', 'Ũ':'Um', 'õ':'on', 'Õ':'On', 'ẽ':'en','ē':'en','ē':'en', 'Ẽ':'En', 'ĩ':'in', 'Ĩ':'In', 'ſ':'s', 'ů':'u', '€':'der','ů':'u', '⸗':'-', 'æ':'ae', 'Æ':'AE', 'œ':'oe', 'Œ':'OE', 'å':'a', 'aͤ':'ä', 'oͤ':'ö', 'uͤ':'ü','vͤ':'ü', 'Aͤ':'Ä', 'Oͤ':'Ö', 'ñ':'nn', 'Uͤ':'Ü', 'Vͤ':'Ü'}


def main(argv):
    """Normalise special characters in a corpus file and rejoin split words.

    Reads the file named by ``argv[1]``, expands abbreviation marks (first
    the ordered, context-sensitive replacements below, then the ``reps``
    table), and writes the result line by line to the file named by
    ``argv[2]``.  A line whose second-to-last character is a hyphen and that
    is followed by a line starting with ``lb`` is merged with that line.  In
    every written line, the text from the first tab to the end of the line
    is replaced by ``<TAB>unknown`` when it contains an underscore.

    Args:
        argv: Command-line argument list; ``argv[1]`` is the input path and
            ``argv[2]`` the output path.

    Returns:
        None.  When fewer than two paths are given, a usage message is
        printed and nothing is read or written.
    """
    if len(argv) < 3:
        print("usage: ./rp <input-file> <output-file>")
        return

    # Read the whole input and close it before the output is opened, so the
    # output file is not truncated when reading fails.
    with open(argv[1], "r") as f:
        corpus = f.read()

    # Order matters: the longer sequence has to be handled before its
    # prefix, otherwise 'ũg' would end up as 'umg' and 'n̄n' as 'nnn'.
    for old, new in (('ũg', 'ung'), ('ũ', 'um'),
                     ('n̄n', 'nn'), ('n̄', 'nn'),
                     ('m̄m', 'mm'), ('m̄', 'mm')):
        corpus = corpus.replace(old, new)

    txt = replace_all(corpus, reps)

    # Split on '\n' only and keep the terminator on every line; a final line
    # without terminator is kept as it is.
    lines = re.findall(r"[^\n]*\n|[^\n]+", txt)
    length = len(lines)

    # Compiled once because both patterns are used for every line.
    tab_to_eol_with_underscore = re.compile(r"\t(.*)_(.*)")
    up_to_last_tab = re.compile(r".*\t")

    with open(argv[2], "w") as o:
        i = 0
        while i < length:
            line = lines[i]
            # None on the last line, so a value left over from an earlier
            # iteration can never trigger a merge.
            nextLine = lines[i + 1] if i + 1 < length else None

            if (nextLine is not None
                    and nextLine.startswith("lb")
                    and searchHyphen(line)):
                # Drop the trailing hyphen and newline, append the
                # lower-cased text after the last tab of the next line
                # (including its newline), then emit an "lb" line.
                newLine = (line[:-2]
                           + up_to_last_tab.sub("", nextLine.lower())
                           + "lb\t\n")
                o.write(tab_to_eol_with_underscore.sub("\tunknown", newLine))
                i += 2
            else:
                o.write(tab_to_eol_with_underscore.sub("\tunknown", line))
                i += 1
	

# Run the conversion only when the file is executed as a script; main()
# receives the full argument list including the program name.
if __name__ == "__main__":
    main(sys.argv)