Sprach- und literaturwissenschaftliche Fakultät - Korpuslinguistik und Morphologie

clean-skript.py

#! /usr/bin/env python
#-*- coding: utf-8-*-
# author: vivian voigt
# version: 2.2
#
# 2014-11-16 uwe springmann:
#   reflect transcription changes in dipl: 'ꝰ':'us','ꝝ':'rum','d̉':'der','v̉':'ü'
#   additional rules for tildes instead of macrons in dipl

# 2014-11-17 vivian voigt:
# added 'oͦ':'o' to the replacement-list
# added function that checks for macrons in line-separated tokens

# 2016-01-12 vivian voigt:
# removed replacement of 'í' to 'i', 'ű' to 'ü', '$' to 'us' and '€' to 'der'

#2016-03-16 laura perlitz: 
#added replacements of ꝰ to 'us', 'v̉ ' to 'ü' and 'ð' to 'der'

#2016-04-06 laura perlitz:
#deleted all replacements of macrons


import sys, os, re
import StringIO



def searchHyphen(str):
    """Return True if the second-to-last character of *str* is a hyphen.

    For a line that ends in a newline this means the visible text ends
    with '-'.  Strings shorter than two characters cannot satisfy the
    test and yield False instead of raising IndexError.

    NOTE: the parameter name shadows the builtin ``str``; it is kept
    unchanged so that existing callers are not affected.
    """
    return len(str) >= 2 and str[-2] == '-'

def replace_all(text, dic):
    """Return *text* with every key of *dic* replaced by its value.

    The replacements are applied one after another in the dictionary's
    iteration order, so a later replacement operates on the result of
    the earlier ones.

    ``dict.items()`` is used instead of ``iteritems()`` because it is
    available on both Python 2 and Python 3.
    """
    for old, new in dic.items():
        text = text.replace(old, new)
    return text

# Table of literal replacements: main() passes it to replace_all(), which
# substitutes every key (a special glyph, or a base letter followed by a
# combining mark) with the plain string given as its value.
# NOTE(review): some keys are written more than once (e.g. 'ꝰ'); in a dict
# literal the last occurrence wins.  Keys that look identical on screen may
# still differ in Unicode normalization form -- check the code points before
# removing any apparent duplicate.
# NOTE(review): the key 'v̉ ' ends with a space, so that entry also consumes
# the blank following the glyph -- confirm this is intended.
reps = {'ꝰ':'us', 'v̉ ':'ü', 'ð':'der', 'ſ':'s', 'ů':'u', 'ů':'u', '⸗':'-', 'æ':'ae', 'Æ':'AE', 'œ':'oe', 'Œ':'OE', 'å':'a', 'aͤ':'ä', 'ͤa':'ä', 'äͤ':'ä', 'oͤ':'ö', 'uͤ':'ü','vͤ':'ü', 'Aͤ':'Ä', 'Oͤ':'Ö', 'Uͤ':'Ü', 'Vͤ':'Ü', '˖':':', 'ʒ':'z', 'ȝ':'z', 'v̂':'ü','o̊':'o','oͦ':'o', 'ꝰ':'us','ꝝ':'rum','d̉':'der','v̉':'ü', '℞':'recipe', '℔':'libra', '℥':'uncia', '℈':'scrupel', 'ÿ':'y', 'dᷣ':'der', 'ꝺᷣ':'der'}

def _normalize_tildes(corpus):
    """Apply the contextual tilde/macron replacements to the whole text.

    NOTE(review): several calls below look identical on screen.  They
    presumably differ in Unicode normalization form (precomposed letter
    vs. base letter + combining mark) -- verify the code points before
    removing any of them.  The order of the calls is significant (e.g.
    the 'n'+tilde rules must run before the bare combining tilde is
    turned into a macron).
    """
    corpus = re.sub('ñn' , 'nn', corpus)
    corpus = re.sub('ñ' , 'nn', corpus)
    corpus = re.sub('ñ' , 'nn', corpus)
    corpus = re.sub('m̃m' , 'mm', corpus)
    corpus = re.sub('m̃' , 'mm', corpus)
    corpus = re.sub('̃', '̄', corpus)
    corpus = re.sub('̃', '̄', corpus)
    corpus = re.sub('ã', 'ā', corpus)
    corpus = re.sub('ā', 'ā', corpus)
    corpus = re.sub('ē', 'ē', corpus)
    corpus = re.sub('ȳ', 'ȳ', corpus)
    # 2014-11-16 us: tilde -> macron for vowels
    corpus = re.sub('ã', 'ā', corpus)
    corpus = re.sub('ẽ', 'ē', corpus)
    corpus = re.sub('ĩ', 'ī', corpus)
    corpus = re.sub('õ', 'ō', corpus)
    corpus = re.sub('ũ', 'ū', corpus)
    corpus = re.sub('õ', 'ō', corpus)
    corpus = re.sub('ũ', 'ū', corpus)
    return corpus


def _expand_macrons(text, line_no):
    """Replace a token containing macrons by its possible readings.

    In the part of *text* after a tab, each combining macron is expanded
    to 'm' and to 'n'; the resulting alternatives are joined with '|'.
    One or two macrons are handled; for more than two a message naming
    *line_no* is printed and *text* is returned unchanged.
    """
    macron = '̄'  # combining macron (U+0304)
    count = text.count(macron)
    if count == 1:
        text = re.sub("\\t(.*)" + macron + "(.*)",
                      "\\t\\1m\\2|\\1n\\2", text)
    elif count == 2:
        # First pattern: second macron directly before the newline;
        # second pattern: further characters follow the second macron.
        text = re.sub("\\t(.*)" + macron + "(.+)" + macron + "\\n",
                      "\\t\\1m\\2m|\\1m\\2n|\\1n\\2m|\\1n\\2n\\n", text)
        text = re.sub("\\t(.*)" + macron + "(.+)" + macron + "(.+)",
                      "\\t\\1m\\2m\\3|\\1m\\2n\\3|\\1n\\2m\\3|\\1n\\2n\\3", text)
    elif count > 2:
        # Single string argument: prints the same text on Python 2 and 3
        # (the former two-argument form printed a tuple repr on Python 2).
        print("Please check line " + str(line_no)
              + " and replace macrons manually: " + text.rstrip("\n"))
    return text


def _mask_unknown(text):
    """Replace the text after a tab by 'unknown' if it contains '_'."""
    return re.sub('\t(.*)_(.*)', '\tunknown', text)


def _is_hyphen_split(line, nextLine):
    """Tell whether *line* ends in a hyphen that continues on an 'lb' line.

    Both lines must contain a tab; lines without one are never merged
    (previously they raised IndexError).
    """
    if "\t" not in line or "\t" not in nextLine:
        return False
    return (nextLine.startswith("lb")
            and len(line) >= 2 and searchHyphen(line)
            and not line.split("\t", 1)[1].startswith("-"))


def _clean_lines(lines):
    """Yield the cleaned output text for *lines*, one chunk per step.

    Every line gets its macrons expanded.  If the line ends in a hyphen
    and the following line starts with "lb", both are merged: the last
    two characters of the line (hyphen and newline) are dropped, the
    text after the last tab of the following line is appended (upper-
    cased if the text after its first tab is upper case, lower-cased
    otherwise), and an "lb<TAB>" line is added.  Finally a token
    containing '_' is replaced by 'unknown'.
    """
    length = len(lines)
    i = 0
    while i < length:
        line = _expand_macrons(lines[i], i + 1)

        # Look ahead only if a following line really exists.  The former
        # code reused values from earlier iterations on the last lines
        # and failed with NameError on one- or two-line inputs.
        nextLine = lines[i + 1] if i + 1 < length else None

        if nextLine is not None and _is_hyphen_split(line, nextLine):
            if nextLine.split("\t", 1)[1].isupper():
                tail = nextLine.upper()
            else:
                tail = nextLine.lower()
            newLine = line[0:-2] + re.sub(".*\\t", "", tail) + "lb\t\n"
            # the merged token may contain macrons from the continuation
            newLine = _expand_macrons(newLine, i + 1)
            yield _mask_unknown(newLine)
            i = i + 2  # the "lb" line has been consumed as well
        else:
            yield _mask_unknown(line)
            i = i + 1


def main(argv):
    """Clean the file named by argv[1] and write the result to argv[2].

    The input is read completely, the contextual tilde/macron rules and
    the ``reps`` replacement table are applied, and the text is then
    processed line by line (macron expansion, merging of hyphenated
    tokens with a following "lb" line, masking of tokens containing
    '_').  If fewer than two file arguments are given, a usage message
    is printed and nothing else happens.

    Both files are closed on exit, also when an error occurs.
    """
    if len(argv) < 3:
        print("""usage: ./rp  """)
        return

    with open(argv[1], "r") as f:
        with open(argv[2], "w") as o:
            corpus = _normalize_tildes(f.read())
            txt = replace_all(corpus, reps)
            # split into lines, keeping the line terminators
            lines = StringIO.StringIO(txt).readlines()
            for chunk in _clean_lines(lines):
                o.write(chunk)
	

# Script entry point: hand the raw command-line argument list to main() when
# this file is executed directly; nothing runs when it is imported.
if __name__ == "__main__":
    main(sys.argv)