Sprach- und literaturwissenschaftliche Fakultät - Korpuslinguistik und Morphologie

cleanV2.py

#! /usr/bin/env python
#-*- coding: utf-8-*-
# author: vivian voigt

import sys, os, re
import StringIO



def searchHyphen(str):
    """Return True if the second-to-last character of *str* is a hyphen.

    The caller passes lines that still carry their trailing newline, so the
    character at index -2 is the last visible character of the line.

    Strings shorter than two characters (for example an empty line that
    consists only of a newline) cannot hold a hyphen at that position and
    yield False instead of raising IndexError.

    Note: the parameter name shadows the builtin ``str``; it is kept
    unchanged so that existing callers are not affected.
    """
    return len(str) >= 2 and str[-2] == '-'

def replace_all(text, dic):
    """Return *text* with every key of *dic* replaced by its value.

    The replacements are applied one after another in the iteration order of
    *dic*, each on the result of the previous one.  If one key can match text
    produced or left over by another replacement, the result therefore
    depends on that order.

    ``items()`` is used instead of ``iteritems()`` so that the function works
    with Python 2 and Python 3 dictionaries (and other mappings) alike.
    """
    for old, new in dic.items():
        text = text.replace(old, new)
    return text

# Literal substitutions handed to replace_all() in main(): every occurrence
# of a key in the corpus text is replaced by the associated value.  The
# entries are kept in their original order.
reps = {
    '$': 'us', 'í': 'i', 'ſ': 's', 'ů': 'u', 'ů': 'u', '€': 'der', '⸗': '-',
    'æ': 'ae', 'Æ': 'AE', 'œ': 'oe', 'Œ': 'OE', 'å': 'a',
    'aͤ': 'ä', 'ͤa': 'ä', 'äͤ': 'ä', 'oͤ': 'ö', 'uͤ': 'ü', 'vͤ': 'ü',
    'Aͤ': 'Ä', 'Oͤ': 'Ö', 'Uͤ': 'Ü', 'Vͤ': 'Ü',
    '˖': ':', 'ʒ': 'z', 'ȝ': 'z', 'v̂': 'ü', 'ű': 'ü', 'o̊': 'o',
}

def main(argv):
    """Clean a tab-separated corpus file and write the result to a new file.

    ``argv[1]`` is the path of the input file, ``argv[2]`` the path of the
    output file; with fewer than three entries in *argv* a usage line is
    printed and nothing else happens.

    Processing steps:

    1. The whole input is read and a fixed, ordered series of contextual
       substitutions is applied (n/m carrying a macron or tilde become
       ``nn``/``mm``, remaining tildes are turned into macrons).
    2. The literal substitutions in ``reps`` are applied via
       ``replace_all``.
    3. Line by line, the text after a tab that contains one or two combining
       macrons is replaced by all readings of each macron as ``m`` or ``n``,
       joined by ``|``.  Lines with more than two macrons are left unchanged
       and reported on standard output as ``(line number, line)``.
    4. A line whose second-to-last character is a hyphen and whose successor
       starts with ``lb`` is merged with the lower-cased text after the last
       tab of that successor, followed by a fresh ``lb`` line.
    5. On output, text after a tab that contains ``_`` is replaced by
       ``unknown``.
    """
    if len(argv) < 3:
        print("""usage: ./rp  """)
        return

    # Read the complete input first; the with-block closes the file even if
    # reading fails.
    with open(argv[1], "r") as f:
        corpus = f.read()

    # contextual replacements -- the order matters: the longer sequences
    # ('..n', '..m') have to be handled before their shorter prefixes.
    corpus = re.sub('n̄n' , 'nn', corpus)
    corpus = re.sub('ñn' , 'nn', corpus)
    corpus = re.sub('ñ' , 'nn', corpus)
    corpus = re.sub('ñ' , 'nn', corpus)
    corpus = re.sub('n̄' , 'nn', corpus)
    corpus = re.sub('m̄m' , 'mm', corpus)
    corpus = re.sub('m̄' , 'mm', corpus)
    corpus = re.sub('m̃m' , 'mm', corpus)
    corpus = re.sub('m̃' , 'mm', corpus)
    # Any remaining combining tilde becomes a combining macron.
    corpus = re.sub('̃', '̄', corpus)
    # NOTE(review): pattern and replacement of several of the following pairs
    # look identical; presumably they differ in Unicode composition
    # (precomposed vs. base letter + combining mark) -- confirm before
    # touching these literals.
    corpus = re.sub('ã', 'ā', corpus)
    corpus = re.sub('ā', 'ā', corpus)
    corpus = re.sub('ē', 'ē', corpus)
    corpus = re.sub('ī', 'ī', corpus)
    corpus = re.sub('õ', 'ō', corpus)
    corpus = re.sub('ō', 'ō', corpus)
    corpus = re.sub('ū', 'ū', corpus)
    corpus = re.sub('ũ', 'ū', corpus)

    txt = replace_all(corpus, reps)

    # Split the cleaned text into lines that keep their trailing newline.
    lines = StringIO.StringIO(txt).readlines()
    length = len(lines)

    # The with-block makes sure the output is flushed and closed; the
    # original only referenced ``o.close`` / ``f.close`` without calling them.
    with open(argv[2], "w") as o:
        i = 0
        while i < length:
            line = lines[i]

            # replacement of token containing macrons by each potential
            # form, separated by '|'
            macrons = line.count('̄')
            if macrons == 1:
                line = re.sub("\\t(.*)̄(.*)", "\\t\\1m\\2|\\1n\\2", line)
            elif macrons == 2:
                # First pattern: second macron is the last character of the
                # line; second pattern: further text follows it.
                line = re.sub("\\t(.*)̄(.+)̄\\n", "\\t\\1m\\2m|\\1m\\2n|\\1n\\2m|\\1n\\2n\\n", line)
                line = re.sub("\\t(.*)̄(.+)̄(.+)", "\\t\\1m\\2m\\3|\\1m\\2n\\3|\\1n\\2m\\3|\\1n\\2n\\3", line)
            elif macrons > 2:
                # Too many combinations: report the line instead of expanding.
                print((i + 1, line))

            # Look ahead one line.  On the last line there is no successor;
            # using None avoids both the NameError for single-line input and
            # the reuse of a stale value from the previous iteration.
            if i < length - 1:
                nextLine = lines[i + 1]
            else:
                nextLine = None

            # exists a hyphen and a next line (the length check keeps very
            # short lines from raising inside searchHyphen)
            if (nextLine is not None
                    and nextLine.startswith("lb")
                    and len(line) >= 2
                    and searchHyphen(line)):
                # Drop the hyphen and newline, append the lower-cased token of
                # the following line and re-emit an 'lb' line after it.
                newLine = line[:-2] + re.sub(".*\\t", "", nextLine.lower()) + "lb\t\n"
                # replacement of token, containing '_' by 'unknown'
                o.write(re.sub('\t(.*)_(.*)', '\tunknown', newLine))
                i = i + 2
            else:
                o.write(re.sub('\t(.*)_(.*)', '\tunknown', line))
                i = i + 1
	

# Run the cleaner only when this file is executed as a script.  The complete
# argument vector is handed to main(), which reads the input path from index 1
# and the output path from index 2.
if __name__ == "__main__":
    main(sys.argv)