modeSearch.py (5 additions, 1 deletion)

@@ -30,13 +30,17 @@ def searchPath(rootpath, include_pairs=True, verbosity=1):
        'pair': re.compile(r'({0})-({0})\.mode'.format(lang_code)),
        'analyzer': re.compile(r'(({0}(-{0})?)-(an)?mor(ph)?)\.mode'.format(lang_code)),
        'generator': re.compile(r'(({0}(-{0})?)-gener[A-z]*)\.mode'.format(lang_code)),
-       'tagger': re.compile(r'(({0}(-{0})?)-tagger)\.mode'.format(lang_code))
+       'tagger': re.compile(r'(({0}(-{0})?)-tagger)\.mode'.format(lang_code)),
+       'spell': re.compile(r'(({0}(-{0})?)-spell)\.mode'.format(lang_code)),
+       'tokenise': re.compile(r'(({0}(-{0})?)-tokenise)\.mode'.format(lang_code))
    }
    modes = {
        'pair': [],
        'analyzer': [],
        'generator': [],
        'tagger': [],
+       'spell': [],
+       'tokenise': [],
    }

    real_root = os.path.abspath(os.path.realpath(rootpath))
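
As a side note, a minimal sketch of what the two new patterns match; the lang_code pattern is defined earlier in modeSearch.py, so the simple two-to-three-letter form below is an assumption for illustration:

    import re

    lang_code = r'[a-z]{2,3}'  # assumed stand-in for the real pattern
    spell = re.compile(r'(({0}(-{0})?)-spell)\.mode'.format(lang_code))
    tokenise = re.compile(r'(({0}(-{0})?)-tokenise)\.mode'.format(lang_code))

    assert spell.match('kaz-spell.mode')        # a monolingual speller mode
    assert tokenise.match('kaz-tokenise.mode')  # its companion tokeniser mode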
servlet.py (51 additions, 2 deletions)

@@ -21,7 +21,7 @@
from urllib.parse import urlparse, urlunsplit
import heapq
from tornado.locks import Semaphore

from streamparser import parse, known
import tornado
import tornado.web
import tornado.httpserver
@@ -98,6 +98,7 @@ class BaseHandler(tornado.web.RequestHandler):
    analyzers = {}
    generators = {}
    taggers = {}
    spellers = {}
    pipelines = {}  # (l1, l2): [translation.Pipeline], only contains flushing pairs!
    pipelines_holding = []
    callback = None
@@ -282,6 +283,8 @@ def langs(trgt): return src, trgt
            self.sendResponse({pair: modename for (pair, (path, modename)) in self.generators.items()})
        elif query == 'taggers' or query == 'disambiguators':
            self.sendResponse({pair: modename for (pair, (path, modename)) in self.taggers.items()})
        elif query == 'spellers':
            self.sendResponse({lang_src: modename for (lang_src, (path, modename)) in self.spellers.items()})
        else:
            self.send_error(400, explanation='Expecting q argument to be one of analysers, generators, disambiguators, or pairs')

@@ -769,6 +772,48 @@ def get(self):
            self.send_error(400, explanation='That mode is not installed')


class SpellerHandler(BaseHandler):

    @tornado.web.asynchronous
    @gen.coroutine
    def get(self):
        in_text = self.get_argument('q') + '*'
        in_mode = toAlpha3Code(self.get_argument('lang'))
        logging.info(in_text)
        logging.info(in_mode)
        if in_mode in self.spellers:
            path, mode = self.spellers[in_mode]
            formatting = 'none'
            commands = [['apertium', '-d', path, '-f', formatting, self.get_argument('lang') + '-tokenise']]
            result = yield translation.translateSimple(in_text, commands)

            tokens = parse(result)
            units = []
            for token in tokens:
                if token.knownness == known:
                    units.append({'token': token.wordform, 'known': True, 'sugg': []})
                else:
                    suggestions = []
                    commands = [['apertium', '-d', path, '-f', formatting, mode]]
                    result = yield translation.translateSimple(token.wordform, commands)
                    # Skip ahead to the "Corrections for ..." header line, then
                    # collect one (suggestion, weight) pair per following line.
                    foundSugg = False
                    for line in result.split('\n'):
                        if 'Corrections for' in line:
                            foundSugg = True
                            continue
                        if foundSugg and ' ' in line:
                            s, w = line.split(' ')
                            suggestions.append((s, w))
                    units.append({'token': token.wordform, 'known': False, 'sugg': suggestions})

            self.sendResponse(units)
        else:
            logging.warning('Speller mode %s is not installed', in_mode)
            self.send_error(404, explanation='That spelling mode is not installed')
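
# A hedged usage sketch for SpellerHandler (the port is APy's usual default and
# the JSON values are illustrative, not real speller output):
#
#   curl 'http://localhost:2737/speller?lang=kaz&q=sozder'
#
# would answer with one unit per token, e.g.
#   [{"token": "sozder", "known": false, "sugg": [["sözder", "1.0"]]}]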


class GenerateHandler(BaseHandler):

    def preproc_text(self, in_text):
@@ -1140,6 +1185,9 @@ def setupHandler(
    Handler.generators[lang_pair] = (dirpath, modename)
    for dirpath, modename, lang_pair in modes['tagger']:
        Handler.taggers[lang_pair] = (dirpath, modename)
    # Only register a speller when the language also ships a tokeniser mode,
    # since SpellerHandler pipes its input through '<lang>-tokenise' first.
    for dirpath, modename, lang_src in modes['spell']:
        if any(lang_src == elem[2] for elem in modes['tokenise']):
            Handler.spellers[lang_src] = (dirpath, modename)

    Handler.initPairsGraph()
    Handler.initPaths()
@@ -1250,7 +1298,8 @@ def sanity_check():
        (r'/identifyLang', IdentifyLangHandler),
        (r'/getLocale', GetLocaleHandler),
        (r'/pipedebug', PipeDebugHandler),
-       (r'/suggest', SuggestionHandler)
+       (r'/suggest', SuggestionHandler),
+       (r'/speller', SpellerHandler)
    ])

    if args.bypass_token:
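
With the new route in place, spellers can be discovered alongside the existing mode listings; assuming the handler that serves the q=spellers query above is mounted at /list as in stock APy, and with an illustrative language:

    curl 'http://localhost:2737/list?q=spellers'
    # e.g. {"kaz": "kaz-spell"}  (illustrative mapping of language to mode name)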
streamparser.py (new file, 193 additions)

@@ -0,0 +1,193 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Usage: streamparser.py [FILE]

Consumes input from a file (first argument) or stdin, parsing and pretty printing the readings of lexical units found.
"""

import re
import pprint
import sys
import itertools
import fileinput
from collections import namedtuple


class Knownness:
__doc__ = """Level of knowledge associated with a lexical unit.
Values:
known
unknown: Denoted by '*', analysis not available.
biunknown: Denoted by '@', translation not available.
genunknown: Denoted by '#', generated form not available.
"""
symbol = ""


class known(Knownness):
    pass


class unknown(Knownness):
    symbol = "*"


class biunknown(Knownness):
    symbol = "@"


class genunknown(Knownness):
    symbol = "#"


SReading = namedtuple('SReading', ['baseform', 'tags'])
try:
SReading.__doc__ = """A single subreading of an analysis of a token.
Fields:
baseform (str): The base form (lemma, lexical form, citation form) of the reading.
tags (list of str): The morphological tags associated with the reading.
"""
except AttributeError:
# Python 3.2 users have to read the source
pass


def subreadingToString(sub):
    return sub.baseform + "".join("<" + t + ">" for t in sub.tags)


def readingToString(reading):
    return "+".join(subreadingToString(sub) for sub in reading)


def mainpos(reading, ltr=False):
    """Return the first part-of-speech tag of a reading. If there are
    several subreadings, by default give the first tag of the last
    subreading. If ltr=True, give the first tag of the first
    subreading; see
    http://beta.visl.sdu.dk/cg3/single/#sub-stream-apertium for more
    information.
    """
    if ltr:
        return reading[0].tags[0]
    else:
        return reading[-1].tags[0]
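
# Using the illustrative reading from above:
#   mainpos(reading) == 'prn'              # first tag of the last subreading
#   mainpos(reading, ltr=True) == 'vblex'  # first tag of the first subreading
# where reading = [SReading('ver', ['vblex', 'inf']), SReading('lo', ['prn', 'enc'])]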


class LexicalUnit:

    """A lexical unit consisting of a word form and its readings.

    Attributes:
        lexicalUnit (str): The lexical unit in Apertium stream format.
        wordform (str): The word form (surface form) of the lexical unit.
        readings (list of list of SReading): The analyses of the lexical unit with sublists containing all subreadings.
        knownness (Knownness): The level of knowledge of the lexical unit.
    """

    knownness = known

    def __init__(self, lexicalUnit):
        self.lexicalUnit = lexicalUnit

        cohort = re.split(r'(?<!\\)/', lexicalUnit)
        self.wordform = cohort[0]
        readings = cohort[1:]

        self.readings = []
        for reading in readings:
            if len(reading) < 1:
                print("WARNING: Empty readings for {}".format(self.lexicalUnit), file=sys.stderr)
            elif reading[0] not in '*#@':
                subreadings = []

                subreadingParts = re.findall(r'([^<]+)((?:<[^>]+>)+)', reading)
                for subreading in subreadingParts:
                    baseform = subreading[0].lstrip('+')
                    tags = re.findall(r'<([^>]+)>', subreading[1])

                    subreadings.append(SReading(baseform=baseform, tags=tags))

                self.readings.append(subreadings)
            else:
                self.knownness = {'*': unknown, '@': biunknown, '#': genunknown}[reading[0]]

    def __repr__(self):
        return self.lexicalUnit
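
# For example (input string is illustrative):
#   lu = LexicalUnit('cats/cat<n><pl>')
#   lu.wordform == 'cats'
#   lu.readings == [[SReading(baseform='cat', tags=['n', 'pl'])]]
# Note that parse() strips the '^' and '$' delimiters before constructing the
# LexicalUnit, so the constructor never sees them.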


def parse(stream, withText=False):
    """Generates lexical units from a character stream.

    Args:
        stream (iterable): A character stream containing lexical units, superblanks and other text.
        withText (bool, optional): A boolean defining whether to output preceding text with each lexical unit.

    Yields:
        LexicalUnit: The next lexical unit found in the character stream. (if withText is False)
        (str, LexicalUnit): The next lexical unit found in the character stream and the text that separated it from the prior unit in a tuple. (if withText is True)
    """

    stream = iter(stream)  # make the next() calls below work even for a plain string
    buffer = ''
    textBuffer = ''
    inLexicalUnit = False
    inSuperblank = False

    for char in stream:
        if inSuperblank:
            if char == ']':
                inSuperblank = False
                textBuffer += char
            elif char == '\\':
                textBuffer += char
                textBuffer += next(stream)
            else:
                textBuffer += char
        elif inLexicalUnit:
            if char == '$':
                if withText:
                    yield (textBuffer, LexicalUnit(buffer))
                else:
                    yield LexicalUnit(buffer)
                buffer = ''
                textBuffer = ''
                inLexicalUnit = False
            elif char == '\\':
                buffer += char
                buffer += next(stream)
            else:
                buffer += char
        else:
            if char == '[':
                inSuperblank = True
                textBuffer += char
            elif char == '^':
                inLexicalUnit = True
            elif char == '\\':
                textBuffer += char
                textBuffer += next(stream)
            else:
                textBuffer += char
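

# A quick usage sketch (the input string is illustrative):
#   for lu in parse('^cats/cat<n><pl>$ and ^dogs/*dogs$'):
#       print(lu.wordform, lu.knownness.symbol)
# prints 'cats' with an empty symbol (known) and 'dogs' with '*' (unknown).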


def parse_file(f, withText=False):
    """Generates lexical units from a file.

    Args:
        f (file): A file containing lexical units, superblanks and other text.
        withText (bool, optional): A boolean defining whether to output preceding text with each lexical unit.

    Yields:
        LexicalUnit: The next lexical unit found in the file.
    """

    return parse(itertools.chain.from_iterable(f), withText)


if __name__ == '__main__':
    lexicalUnits = parse_file(fileinput.input())

    for lexicalUnit in lexicalUnits:
        pprint.pprint(lexicalUnit.readings, width=120)
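
For reference, a hedged sample invocation (input and output lines are illustrative, not captured from a real run):

    $ echo '^cats/cat<n><pl>$' | python3 streamparser.py
    [[SReading(baseform='cat', tags=['n', 'pl'])]]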