modeSearch.py (5 additions, 1 deletion)

@@ -30,13 +30,17 @@ def searchPath(rootpath, include_pairs=True, verbosity=1):
        'pair': re.compile(r'({0})-({0})\.mode'.format(lang_code)),
        'analyzer': re.compile(r'(({0}(-{0})?)-(an)?mor(ph)?)\.mode'.format(lang_code)),
        'generator': re.compile(r'(({0}(-{0})?)-gener[A-z]*)\.mode'.format(lang_code)),
-       'tagger': re.compile(r'(({0}(-{0})?)-tagger)\.mode'.format(lang_code))
+       'tagger': re.compile(r'(({0}(-{0})?)-tagger)\.mode'.format(lang_code)),
+       'spell': re.compile(r'(({0}(-{0})?)-spell)\.mode'.format(lang_code)),
+       'tokenise': re.compile(r'(({0}(-{0})?)-tokenise)\.mode'.format(lang_code))
    }
    modes = {
        'pair': [],
        'analyzer': [],
        'generator': [],
        'tagger': [],
+       'spell': [],
+       'tokenise': [],
    }

    real_root = os.path.abspath(os.path.realpath(rootpath))
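
As a side note, a minimal sketch of what the two new patterns match; the lang_code pattern is defined earlier in modeSearch.py, so the simple two-to-three-letter form below is an assumption for illustration:

    import re

    lang_code = r'[a-z]{2,3}'  # assumed stand-in for the real pattern
    spell = re.compile(r'(({0}(-{0})?)-spell)\.mode'.format(lang_code))
    tokenise = re.compile(r'(({0}(-{0})?)-tokenise)\.mode'.format(lang_code))

    assert spell.match('kaz-spell.mode')        # a monolingual speller mode
    assert tokenise.match('kaz-tokenise.mode')  # its companion tokeniser mode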
servlet.py (51 additions, 2 deletions)

@@ -21,7 +21,7 @@
from urllib.parse import urlparse, urlunsplit
import heapq
from tornado.locks import Semaphore

from streamparser import parse, known
import tornado
import tornado.web
import tornado.httpserver
@@ -98,6 +98,7 @@ class BaseHandler(tornado.web.RequestHandler):
    analyzers = {}
    generators = {}
    taggers = {}
    spellers = {}
    pipelines = {}  # (l1, l2): [translation.Pipeline], only contains flushing pairs!
    pipelines_holding = []
    callback = None
@@ -282,6 +283,8 @@ def langs(trgt): return src, trgt
            self.sendResponse({pair: modename for (pair, (path, modename)) in self.generators.items()})
        elif query == 'taggers' or query == 'disambiguators':
            self.sendResponse({pair: modename for (pair, (path, modename)) in self.taggers.items()})
        elif query == 'spellers':
            self.sendResponse({lang_src: modename for (lang_src, (path, modename)) in self.spellers.items()})
        else:
            self.send_error(400, explanation='Expecting q argument to be one of analysers, generators, disambiguators, or pairs')

@@ -769,6 +772,48 @@ def get(self):
            self.send_error(400, explanation='That mode is not installed')


class SpellerHandler(BaseHandler):

    @tornado.web.asynchronous
    @gen.coroutine
    def get(self):
        in_text = self.get_argument('q') + '*'
        in_mode = toAlpha3Code(self.get_argument('lang'))
        logging.info(in_text)
        logging.info(in_mode)
        if in_mode in self.spellers:
            path, mode = self.spellers[in_mode]
            formatting = 'none'
            commands = [['apertium', '-d', path, '-f', formatting, self.get_argument('lang') + '-tokenise']]
            result = yield translation.translateSimple(in_text, commands)

            tokens = parse(result)
            units = []
            for token in tokens:
                if token.knownness == known:
                    units.append({'token': token.wordform, 'known': True, 'sugg': []})
                else:
                    suggestions = []
                    commands = [['apertium', '-d', path, '-f', formatting, mode]]
                    result = yield translation.translateSimple(token.wordform, commands)
                    # Skip ahead to the "Corrections for ..." header line, then
                    # collect one (suggestion, weight) pair per following line.
                    foundSugg = False
                    for line in result.split('\n'):
                        if 'Corrections for' in line:
                            foundSugg = True
                            continue
                        if foundSugg and ' ' in line:
                            s, w = line.split(' ')
                            suggestions.append((s, w))
                    units.append({'token': token.wordform, 'known': False, 'sugg': suggestions})

            self.sendResponse(units)
        else:
            logging.warning('Speller mode %s is not installed', in_mode)
            self.send_error(404, explanation='That spelling mode is not installed')
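
# A hedged usage sketch for SpellerHandler (the port is APy's usual default and
# the JSON values are illustrative, not real speller output):
#
#   curl 'http://localhost:2737/speller?lang=kaz&q=sozder'
#
# would answer with one unit per token, e.g.
#   [{"token": "sozder", "known": false, "sugg": [["sözder", "1.0"]]}]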


class GenerateHandler(BaseHandler):

    def preproc_text(self, in_text):
@@ -1140,6 +1185,9 @@ def setupHandler(
    Handler.generators[lang_pair] = (dirpath, modename)
    for dirpath, modename, lang_pair in modes['tagger']:
        Handler.taggers[lang_pair] = (dirpath, modename)
    # Only register a speller when the language also ships a tokeniser mode,
    # since SpellerHandler pipes its input through '<lang>-tokenise' first.
    for dirpath, modename, lang_src in modes['spell']:
        if any(lang_src == elem[2] for elem in modes['tokenise']):
            Handler.spellers[lang_src] = (dirpath, modename)

    Handler.initPairsGraph()
    Handler.initPaths()
@@ -1250,7 +1298,8 @@ def sanity_check():
        (r'/identifyLang', IdentifyLangHandler),
        (r'/getLocale', GetLocaleHandler),
        (r'/pipedebug', PipeDebugHandler),
-       (r'/suggest', SuggestionHandler)
+       (r'/suggest', SuggestionHandler),
+       (r'/speller', SpellerHandler)
    ])

    if args.bypass_token:
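
With the new route in place, spellers can be discovered alongside the existing mode listings; assuming the handler that serves the q=spellers query above is mounted at /list as in stock APy, and with an illustrative language:

    curl 'http://localhost:2737/list?q=spellers'
    # e.g. {"kaz": "kaz-spell"}  (illustrative mapping of language to mode name)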
streamparser.py (new file, 193 additions)

@@ -0,0 +1,193 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Usage: streamparser.py [FILE]

Consumes input from a file (first argument) or stdin, parsing and pretty printing the readings of lexical units found.
"""

import re
import pprint
import sys
import itertools
import fileinput
from collections import namedtuple


class Knownness:
__doc__ = """Level of knowledge associated with a lexical unit.
Values:
known
unknown: Denoted by '*', analysis not available.
biunknown: Denoted by '@', translation not available.
genunknown: Denoted by '#', generated form not available.
"""
symbol = ""


class known(Knownness):
    pass


class unknown(Knownness):
    symbol = "*"


class biunknown(Knownness):
    symbol = "@"


class genunknown(Knownness):
    symbol = "#"


SReading = namedtuple('SReading', ['baseform', 'tags'])
try:
SReading.__doc__ = """A single subreading of an analysis of a token.
Fields:
baseform (str): The base form (lemma, lexical form, citation form) of the reading.
tags (list of str): The morphological tags associated with the reading.
"""
except AttributeError:
# Python 3.2 users have to read the source
pass


def subreadingToString(sub):
    return sub.baseform + "".join("<" + t + ">" for t in sub.tags)


def readingToString(reading):
    return "+".join(subreadingToString(sub) for sub in reading)


def mainpos(reading, ltr=False):
    """Return the first part-of-speech tag of a reading. If there are
    several subreadings, by default give the first tag of the last
    subreading. If ltr=True, give the first tag of the first
    subreading; see
    http://beta.visl.sdu.dk/cg3/single/#sub-stream-apertium for more
    information.
    """
    if ltr:
        return reading[0].tags[0]
    else:
        return reading[-1].tags[0]
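
# Using the illustrative reading from above:
#   mainpos(reading) == 'prn'              # first tag of the last subreading
#   mainpos(reading, ltr=True) == 'vblex'  # first tag of the first subreading
# where reading = [SReading('ver', ['vblex', 'inf']), SReading('lo', ['prn', 'enc'])]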


class LexicalUnit:

    """A lexical unit consisting of a word form and its readings.

    Attributes:
        lexicalUnit (str): The lexical unit in Apertium stream format.
        wordform (str): The word form (surface form) of the lexical unit.
        readings (list of list of SReading): The analyses of the lexical unit with sublists containing all subreadings.
        knownness (Knownness): The level of knowledge of the lexical unit.
    """

    knownness = known

    def __init__(self, lexicalUnit):
        self.lexicalUnit = lexicalUnit

        cohort = re.split(r'(?<!\\)/', lexicalUnit)
        self.wordform = cohort[0]
        readings = cohort[1:]

        self.readings = []
        for reading in readings:
            if len(reading) < 1:
                print("WARNING: Empty readings for {}".format(self.lexicalUnit), file=sys.stderr)
            elif reading[0] not in '*#@':
                subreadings = []

                subreadingParts = re.findall(r'([^<]+)((?:<[^>]+>)+)', reading)
                for subreading in subreadingParts:
                    baseform = subreading[0].lstrip('+')
                    tags = re.findall(r'<([^>]+)>', subreading[1])

                    subreadings.append(SReading(baseform=baseform, tags=tags))

                self.readings.append(subreadings)
            else:
                self.knownness = {'*': unknown, '@': biunknown, '#': genunknown}[reading[0]]

    def __repr__(self):
        return self.lexicalUnit
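
# For example (input string is illustrative):
#   lu = LexicalUnit('cats/cat<n><pl>')
#   lu.wordform == 'cats'
#   lu.readings == [[SReading(baseform='cat', tags=['n', 'pl'])]]
# Note that parse() strips the '^' and '$' delimiters before constructing the
# LexicalUnit, so the constructor never sees them.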


def parse(stream, withText=False):
    """Generates lexical units from a character stream.

    Args:
        stream (iterable): A character stream containing lexical units, superblanks and other text.
        withText (bool, optional): A boolean defining whether to output preceding text with each lexical unit.

    Yields:
        LexicalUnit: The next lexical unit found in the character stream. (if withText is False)
        (str, LexicalUnit): The next lexical unit found in the character stream and the text that separated it from the prior unit in a tuple. (if withText is True)
    """

    stream = iter(stream)  # make the next() calls below work even for a plain string
    buffer = ''
    textBuffer = ''
    inLexicalUnit = False
    inSuperblank = False

    for char in stream:
        if inSuperblank:
            if char == ']':
                inSuperblank = False
                textBuffer += char
            elif char == '\\':
                textBuffer += char
                textBuffer += next(stream)
            else:
                textBuffer += char
        elif inLexicalUnit:
            if char == '$':
                if withText:
                    yield (textBuffer, LexicalUnit(buffer))
                else:
                    yield LexicalUnit(buffer)
                buffer = ''
                textBuffer = ''
                inLexicalUnit = False
            elif char == '\\':
                buffer += char
                buffer += next(stream)
            else:
                buffer += char
        else:
            if char == '[':
                inSuperblank = True
                textBuffer += char
            elif char == '^':
                inLexicalUnit = True
            elif char == '\\':
                textBuffer += char
                textBuffer += next(stream)
            else:
                textBuffer += char
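

# A quick usage sketch (the input string is illustrative):
#   for lu in parse('^cats/cat<n><pl>$ and ^dogs/*dogs$'):
#       print(lu.wordform, lu.knownness.symbol)
# prints 'cats' with an empty symbol (known) and 'dogs' with '*' (unknown).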


def parse_file(f, withText=False):
    """Generates lexical units from a file.

    Args:
        f (file): A file containing lexical units, superblanks and other text.
        withText (bool, optional): A boolean defining whether to output preceding text with each lexical unit.

    Yields:
        LexicalUnit: The next lexical unit found in the file.
    """

    return parse(itertools.chain.from_iterable(f), withText)


if __name__ == '__main__':
    lexicalUnits = parse_file(fileinput.input())

    for lexicalUnit in lexicalUnits:
        pprint.pprint(lexicalUnit.readings, width=120)
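
For reference, a hedged sample invocation (input and output lines are illustrative, not captured from a real run):

    $ echo '^cats/cat<n><pl>$' | python3 streamparser.py
    [[SReading(baseform='cat', tags=['n', 'pl'])]]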