# clean-skript_V3.py

# Korpuslinguistik und Morphologie | clean-skript_V3.py
# clean-skript_V3.py

#! /usr/bin/env python
#-*- coding: utf-8-*-
# author: vivian voigt
# version: 3.0
#
# 2014-11-16 uwe springmann:
#   reflect transcription changes in dipl: 'ꝰ':'us','ꝝ':'rum','d̉':'der','v̉':'ü'
#   additional rules for tildes instead of macrons in dipl

# 2014-11-17 vivian voigt:
# added 'oͦ':'o' to the replacement-list
# added function that checks for macrons in line-separated tokens

# 2016-01-12 vivian voigt:
# removed replacement of 'í' to 'i', ,'ű' to 'ü', '$' to 'us' and '€' to 'der'

#2016-03-16 laura perlitz: 
#added replacements of 'ꝰ' to 'us', 'v̉ ' to 'ü' and 'ð' to 'der'

#2016-04-06 laura perlitz:
#deleted all replacements of macrons

# 2016-05-30 vivian voigt:
# added a list of different forms of "und" and "oder" (truncHelper) to distinguish words that are separated because of a line break from words that are separated because of truncation

# 2016-06-02 vivian voigt:
# made the script upward compatible with python 3.x (tested with 3.4.3)


import sys, re
try:
    import StringIO
except ImportError:
    import io as StringIO


def searchHyphen(str):
    """Return True if the second-to-last character of *str* is a hyphen.

    For a line that still carries its trailing newline this tests whether
    the visible text ends in '-'.  Strings shorter than two characters
    (e.g. '' or a bare newline) yield False instead of raising IndexError.
    """
    # Slicing never raises: for short strings the slice is simply empty
    # and therefore compares unequal to '-'.
    return str[-2:-1] == '-'

def replace_all(text, dic):
    """Apply every ``old -> new`` substitution from *dic* to *text*.

    The pairs are processed one after another in the mapping's iteration
    order, each on the result of the previous one, so a later pair can
    rewrite the output of an earlier pair.  Returns the rewritten string.
    """
    result = text
    for needle, substitute in dic.items():
        result = result.replace(needle, substitute)
    return result

# list of replacements
# Maps special characters and abbreviation signs to the spelling they are
# replaced with; main() applies the whole table to the corpus text through
# replace_all(), i.e. one str.replace() per key in the dict's iteration order.
# NOTE(review): some keys look identical as rendered (e.g. 'ꝰ', 'ů'); they are
# either true duplicates or differ only in Unicode normalisation form --
# verify the code points before removing any entry.
reps = {'ꝰ':'us', 'v̉ ':'ü', 'ð':'der', 'ſ':'s', 'ů':'u', 'ů':'u', '⸗':'-', 'æ':'ae', 'Æ':'AE', 'œ':'oe', 'Œ':'OE', 'å':'a', 'aͤ':'ä', 'ͤa':'ä', 'äͤ':'ä', 'oͤ':'ö', 'uͤ':'ü','vͤ':'ü', 'Aͤ':'Ä', 'Oͤ':'Ö', 'Uͤ':'Ü', 'Vͤ':'Ü', '˖':':', 'ʒ':'z', 'ȝ':'z', 'v̂':'ü','o̊':'o','oͦ':'o', 'ꝰ':'us','ꝝ':'rum','d̉':'der','v̉':'ü', '℞':'recipe', '℔':'libra', '℥':'uncia', '℈':'scrupel', 'ÿ':'y', 'dᷣ':'der', 'ꝺᷣ':'der'}
# Spelling variants of "und" / "oder".  main() does not join a hyphenated
# token with the token on the following line when that following token is one
# of these: the hyphen then marks a truncation rather than a line break.
truncHelper = {'und', 'Und', 'oder', 'Oder', 'vnd', 'Vnd', 'vnnd', 'Vnnd', 'vñ', 'Vñ', 'vn̄', 'Vn̄', 'oð', 'Oð', 'odder', 'Odder', 'vn', 'Vn', 'od̉', 'Od̉', 'unnd', 'Unnd', 'undt', 'Undt', 'uñ', 'Uñ', 'vund', 'Vund', 'uud', 'Uud', 'Vud', 'vud', 'nnd'}

# Combining macron as it appears in the corpus after the tilde -> macron
# substitution in main() (presumably U+0304 -- confirm the code point).
_MACRON = '̄'


def _expand_macrons(text, line_no):
    """Replace a macron-bearing token by all its m/n readings, joined by '|'.

    Only text following a tab is rewritten.  One macron yields two readings,
    two macrons yield four; with more than two macrons the text is returned
    unchanged and a request for manual correction is printed, quoting
    *line_no*.
    """
    count = text.count(_MACRON)
    if count == 0:
        return text
    if count == 1:
        return re.sub("\\t(.*)" + _MACRON + "(.*)",
                      "\\t\\1m\\2|\\1n\\2", text)
    if count == 2:
        # First form: the second macron is the last character of the line.
        text = re.sub("\\t(.*)" + _MACRON + "(.+)" + _MACRON + "\\n",
                      "\\t\\1m\\2m|\\1m\\2n|\\1n\\2m|\\1n\\2n\\n", text)
        # Second form: further characters follow the second macron.  If the
        # first form matched, no macron is left and this is a no-op.
        return re.sub("\\t(.*)" + _MACRON + "(.+)" + _MACRON + "(.+)",
                      "\\t\\1m\\2m\\3|\\1m\\2n\\3|\\1n\\2m\\3|\\1n\\2n\\3", text)
    print("Please check line " + str(line_no) + " and replace macrons manually.", text)
    return text


def _mask_unknown(text):
    """Replace everything after a tab by 'unknown' if it contains '_'."""
    return re.sub("\t(.*)_(.*)", "\tunknown", text)


def _is_linebreak_split(line, next_line):
    """Tell whether *line* ends a word that continues on *next_line*.

    True when *next_line* exists and starts with "lb", *line* ends in a
    hyphen (per searchHyphen), neither token (the text after the first tab)
    starts with '-', and the next token is not a form of "und"/"oder" from
    truncHelper.  Lines without a tab have no token and are never merged.
    """
    if next_line is None or not next_line.startswith("lb"):
        return False
    if not searchHyphen(line):
        return False
    if "\t" not in line or "\t" not in next_line:
        return False
    token = line.split("\t", 1)[1]
    next_token = next_line.split("\t", 1)[1]
    if token.startswith("-") or next_token.startswith("-"):
        return False
    return next_token.rstrip() not in truncHelper


def main(argv):
    """Clean the corpus file argv[1] and write the result to argv[2].

    Steps: apply the contextual tilde/macron substitutions and the `reps`
    table to the whole text, expand macron-bearing tokens into their m/n
    readings, join a hyphenated token with the token of a following "lb"
    line, and replace tokens containing '_' by 'unknown'.

    Prints a usage message and returns if fewer than two file arguments are
    given.  Both files are opened with the platform default encoding.
    NOTE(review): the input is presumably one tab-separated "tag<TAB>token"
    record per line -- confirm against the corpus format.
    """
    if len(argv) < 3:
        print("usage: ./clean.py <input-file> <output-file>")
        return

    # Read the input completely before the output file is opened (and
    # thereby truncated); `with` guarantees the handle is closed.
    with open(argv[1], "r") as f:
        corpus = f.read()

    # contextual replacements
    # NOTE(review): several of the following calls look like duplicates or
    # no-ops as rendered; presumably they differ in Unicode normalisation
    # form.  Keep the literals and their order untouched.
    corpus = re.sub('ñn' , 'nn', corpus)
    corpus = re.sub('ñ' , 'nn', corpus)
    corpus = re.sub('ñ' , 'nn', corpus)
    corpus = re.sub('m̃m' , 'mm', corpus)
    corpus = re.sub('m̃' , 'mm', corpus)
    corpus = re.sub('̃', '̄', corpus)
    corpus = re.sub('̃', '̄', corpus)
    corpus = re.sub('ã', 'ā', corpus)
    corpus = re.sub('ā', 'ā', corpus)
    corpus = re.sub('ē', 'ē', corpus)
    corpus = re.sub('ȳ', 'ȳ', corpus)
    # 2014-11-16 us additional lines: 
    # replace combining tildes (U+0303) and macrons (U+0304) as well; 
    # line 48 above does not work)
    corpus = re.sub('ã', 'ā', corpus)
    corpus = re.sub('ẽ', 'ē', corpus)
    corpus = re.sub('ĩ', 'ī', corpus)
    corpus = re.sub('õ', 'ō', corpus)
    corpus = re.sub('ũ', 'ū', corpus)
    # end additional lines
    corpus = re.sub('õ', 'ō', corpus)
    corpus = re.sub('ũ', 'ū', corpus)

    txt = replace_all(corpus, reps)

    # Split into lines that keep their trailing newline.
    lines = StringIO.StringIO(txt).readlines()
    length = len(lines)

    with open(argv[2], "w") as o:
        i = 0
        while i < length:
            # replacement of token containing macrons by each potential
            # form, separated by '|'
            line = _expand_macrons(lines[i], i + 1)
            # Look ahead afresh on every iteration; None on the last line.
            nextLine = lines[i + 1] if i + 1 < length else None

            if not _is_linebreak_split(line, nextLine):
                o.write(_mask_unknown(line))
                i += 1
                continue

            # Join the two halves: drop "-\n" from the current line and
            # append the next token, upper-cased only if it is all upper
            # case.  The consumed line is replaced by an empty "lb" record.
            nextTok = nextLine.split("\t", 1)[1]
            if nextTok.isupper():
                continuation = nextLine.upper()
            else:
                continuation = nextLine.lower()
            newLine = line[0:-2] + re.sub(".*\\t", "", continuation) + "lb\t\n"

            # The appended half may have brought in further macrons.
            newLine = _expand_macrons(newLine, i + 1)
            o.write(_mask_unknown(newLine))
            i += 2
	

# Run the cleaning only when the file is executed as a script; main() reads
# the input path from argv[1] and the output path from argv[2].
if __name__ == "__main__":
    main(sys.argv)
# Sprach- und literaturwissenschaftliche Fakultät - Korpuslinguistik und Morphologie