#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Viterbi chart parser (Earley-style) for a probabilistic grammar.

Command-line usage::

    script GRAMMARFILE LEXFILE SENTENCEFILE

GRAMMARFILE holds one rule per line as ``prob lhs rhs1 rhs2 ...``; the
left-hand side of the first rule is the start symbol.  LEXFILE holds one
entry per line as ``prob tag word``.  SENTENCEFILE holds one
whitespace-tokenised sentence per line; each one is parsed and the
probability of its best analysis is printed.
"""

import sys
from collections import defaultdict


class Parser:
    """Chart parser that tracks the best probability of every chart item.

    A chart item ("dotted rule") is the tuple
    ``(lhs, rhs, dotpos, startpos, endpos)``: the rule ``lhs -> rhs`` whose
    first ``dotpos`` right-hand-side symbols have been recognised between
    word positions ``startpos`` and ``endpos``.

    Attributes set by the loaders:
        ruleprobs: maps lhs -> {rhs tuple -> probability}.
        lexprobs: maps word -> {tag -> probability}.
        start_symbol: left-hand side of the first grammar rule.
    """

    def __init__(self, grammarfile, lexfile):
        """Load the grammar and the lexicon from the two given file paths."""
        self.read_grammar(grammarfile)
        self.read_lexicon(lexfile)

    def read_grammar(self, grammarfile):
        """Read grammar rules, one ``prob lhs rhs...`` entry per line.

        Blank lines are skipped.  The left-hand side of the first rule
        becomes ``self.start_symbol``.

        Raises:
            ValueError: if a non-blank line has fewer than two fields, the
                probability is not a number, or the file has no rules.
        """
        self.ruleprobs = defaultdict(dict)
        start_symbol = None
        with open(grammarfile) as file:
            for line in file:
                fields = line.split()
                if not fields:
                    # Tolerate empty lines (e.g. a trailing newline).
                    continue
                prob, lhs, *rhs = fields
                self.ruleprobs[lhs][tuple(rhs)] = float(prob)
                if start_symbol is None:
                    start_symbol = lhs
        if start_symbol is None:
            # Fail at load time instead of with an AttributeError in parse().
            raise ValueError(f"no grammar rules found in {grammarfile!r}")
        self.start_symbol = start_symbol

    def read_lexicon(self, lexfile):
        """Read lexicon entries, one ``prob tag word`` entry per line.

        Blank lines are skipped.

        Raises:
            ValueError: if a non-blank line does not have exactly three
                fields or the probability is not a number.
        """
        self.lexprobs = defaultdict(dict)
        with open(lexfile) as file:
            for line in file:
                fields = line.split()
                if not fields:
                    continue
                prob, tag, word = fields
                self.lexprobs[word][tag] = float(prob)

    def scan(self, word, pos):
        """Add a completed item ``tag -> word`` over (pos, pos+1) per tag."""
        # .get() keeps unknown words from being inserted into the lexicon.
        for tag, p in self.lexprobs.get(word, {}).items():
            self.add((tag, (word,), 1, pos, pos + 1), p)

    def predict(self, lhs, pos):
        """Add an item with the dot at 0 at ``pos`` for every rule of lhs."""
        # Symbols without rules (e.g. lexical tags) simply predict nothing;
        # .get() avoids creating empty grammar entries for them.
        for rhs, prob in self.ruleprobs.get(lhs, {}).items():
            self.add((lhs, rhs, 0, pos, pos), prob)

    def complete(self, dottedrule, prob):
        """Advance every item that was waiting for the completed category.

        ``dottedrule`` is a finished item of category ``cat`` spanning
        (splitpos, endpos).  Each item ending at ``splitpos`` whose next
        symbol is ``cat`` is advanced over it, with the two probabilities
        multiplied.
        """
        cat, _, _, splitpos, endpos = dottedrule
        # NOTE: rules with an empty right-hand side are not supported.  For
        # them splitpos == endpos, so add() would insert into the very dict
        # being iterated here.
        for dottedrule2, p in self.vitprob[splitpos].items():
            lhs, rhs, dotpos, startpos, _ = dottedrule2
            if dotpos < len(rhs) and rhs[dotpos] == cat:
                self.add((lhs, rhs, dotpos + 1, startpos, endpos), prob * p)

    def add(self, dottedrule, prob):
        """Record an item if ``prob`` beats its best score, then propagate.

        A finished item triggers ``complete``; an unfinished one triggers
        ``predict`` for the symbol after the dot.  Items whose score does
        not improve are ignored, which also stops the recursion on
        left-recursive rules.
        """
        _, rhs, dotpos, _, endpos = dottedrule
        if self.vitprob[endpos][dottedrule] < prob:
            self.vitprob[endpos][dottedrule] = prob
            if dotpos == len(rhs):
                self.complete(dottedrule, prob)
            else:
                self.predict(rhs[dotpos], endpos)

    def parse(self, words):
        """Parse a tokenised sentence and print the best parse probability.

        Args:
            words: list of word strings.

        Prints the sentence followed by either the probability of the best
        start-symbol analysis covering all words, or a message that the
        sentence is ungrammatical.
        """
        # vitprob[k] maps each item ending at position k to its best score.
        self.vitprob = [defaultdict(float) for _ in range(len(words) + 1)]
        self.predict(self.start_symbol, 0)
        for pos, word in enumerate(words):
            self.scan(word, pos)

        # Best finished start-symbol item that spans the whole input.
        bestscore = max(
            (
                score
                for (lhs, rhs, dotpos, startpos, _), score
                in self.vitprob[-1].items()
                if lhs == self.start_symbol
                and dotpos == len(rhs)
                and startpos == 0
            ),
            default=0,
        )

        print("Satz:", ' '.join(words))
        if bestscore > 0:
            # "The probability of the best parse tree is"
            print("Die Wahrscheinlichkeit des besten Parsebaumes ist",
                  bestscore)
        else:
            # "The input sentence is ungrammatical."
            print("Der Eingabesatz ist ungrammatisch.")


# Run the driver only when executed as a script, so that importing this
# module has no side effects.
if __name__ == "__main__":
    parser = Parser(sys.argv[1], sys.argv[2])
    with open(sys.argv[3]) as file:
        for line in file:
            parser.parse(line.split())