61 changes: 56 additions & 5 deletions lib/tokenizer/tokenizer.rb
@@ -22,7 +22,20 @@ class WhitespaceTokenizer
PAIR_POST = [')', '}', ']', '>', '»', '“']

# Characters which can be both prefixes AND suffixes.
PRE_N_POST = ['"', "'"]
PRE_N_POST = ['"']

# Characters which can be both prefixes AND suffixes, but which are only
# splittable at the beginning or end of a token, or when separated from
# the token edge only by other splittables.
# Taking the single quote "'" as a PRE_N_POST_ONLY splittable, the
# following are valid uses as a splittable:
#   'test quotes'
#   'test quotes'.    <- suffixed by another splittable
#   ('test quotes').  <- prefixed and suffixed by another splittable
# The following are not valid uses as a splittable:
#   l'interrelation
#   l'imagerie
PRE_N_POST_ONLY = ["'"]

private_constant :FS
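
For reviewers, a minimal sketch of the intended PRE_N_POST_ONLY behaviour (assumes a build of this branch; the sample strings are illustrative and the expected arrays are hand-derived to mirror the regression tests below):

    require 'tokenizer'

    t = Tokenizer::WhitespaceTokenizer.new(:de)
    t.tokenize("'quoted'")   # edge quotes split out:    ["'", "quoted", "'"]
    t.tokenize("l'imagerie") # word-internal quote kept: ["l'imagerie"]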

@@ -49,12 +62,50 @@ def tokenize(str)

splittables = SIMPLE_PRE + SIMPLE_POST + PAIR_PRE + PAIR_POST + PRE_N_POST
pattern = Regexp.new("[^#{Regexp.escape(splittables.join)}]+")
pattern_prepostonly_pfix =
  Regexp.new("^[#{Regexp.escape((splittables + PRE_N_POST_ONLY).join)}]*" \
             "[#{Regexp.escape(PRE_N_POST_ONLY.join)}]+" \
             "[#{Regexp.escape((splittables + PRE_N_POST_ONLY).join)}]*")
pattern_prepostonly_sfix =
  Regexp.new("[#{Regexp.escape((splittables + PRE_N_POST_ONLY).join)}]*" \
             "[#{Regexp.escape(PRE_N_POST_ONLY.join)}]+" \
             "[#{Regexp.escape((splittables + PRE_N_POST_ONLY).join)}]*$")
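
A rough illustration of what the two anchored patterns match, using a reduced splittables set for readability; the expected values in the comments are hand-derived assumptions, not captured output:

    splittables_demo = ['.', ',', '(', ')']
    only = ["'"]
    all  = Regexp.escape((splittables_demo + only).join)
    pfix = Regexp.new("^[#{all}]*[#{Regexp.escape(only.join)}]+[#{all}]*")

    "('test"[pfix]     # => "('"  quote at the token edge, preceded by a splittable
    "l'imagerie"[pfix] # => nil   word-internal quote, so the anchored match fails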
# The most accommodating URL regex I found was here:
# http://stackoverflow.com/a/24058129/4852737
url_pattern = %r{(([\w]+:)?\/\/)?(([\d\w]|%[a-fA-F\d]{2,2})+(:([\d\w]|%[a-fA-F\d]{2,2})+)?@)?([\d\w][-\d\w]{0,253}[\d\w]\.)+[\w]{2,63}(:[\d]+)?(\/([-+_~.\d\w]|%[a-fA-F\d]{2,2})*)*(\?(&?([-+_~.\d\w]|%[a-fA-F\d]{2,2})=?)*)?(#([-+_~.\d\w]|%[a-fA-F\d]{2,2})*)?}
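
A few hand-checked probes of the pattern; the expected results are read off the regex, not captured from a test run. Note the pattern is unanchored, which is also why the email address in test_tokenization_004 below is caught by the URL branch:

    url_pattern.match('www.google.com.au') # => MatchData (dotted host, scheme optional)
    url_pattern.match('user@example.com')  # => MatchData (userinfo@host form; address is illustrative)
    url_pattern.match('comportement')      # => nil (no dot-separated host)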
output = []
tokens.each do |token|
  prefix, stem, suffix = token.partition(pattern)
  output << prefix.split('') unless prefix.empty?
  output << stem unless stem.empty?
  output << suffix.split('') unless suffix.empty?
  if url_pattern.match(token)
    # If the token is recognised as a URL and its last character is a
    # splittable, split that character off.
    output << (splittables.include?(token[-1]) ?
               [token[0...-1], token[-1]] : token)
  else
    # If the prefix characters are PRE_N_POST_ONLY splittables, split them off.
    prefix, stem, suffix = token.partition(pattern_prepostonly_pfix)
    output << stem.split('') unless stem.empty?
    token_remaining = stem.empty? ? prefix : suffix
    prefix, stem, suffix = token_remaining.partition(pattern)
    output << prefix.split('') unless prefix.empty?
    unless stem.empty?
      # If the suffix characters are any splittable, including
      # PRE_N_POST_ONLY, split them off.
      prefix, stem, suffix_discard = stem.partition(pattern_prepostonly_sfix)
      output << prefix unless prefix.empty?
      output << stem.split('') unless stem.empty?
    end
    # While the suffix is not empty, emit its first character as a splittable
    # token and partition the remaining suffix.
    while suffix.length > 0
      prior_suffix = suffix
      output << suffix[0]
      prefix, stem, suffix = prior_suffix[1..-1].partition(pattern)
      output << prefix.split('') unless prefix.empty?
      output << stem unless stem.empty?
    end
  end
end

output.flatten
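
Taken together, a sketch of the end-to-end behaviour this diff aims for; the inputs are abbreviated variants of the new tests, and the expected arrays are hand-derived:

    t = Tokenizer::WhitespaceTokenizer.new(:de)
    t.tokenize('test url www.google.com.')
    # => ["test", "url", "www.google.com", "."]
    t.tokenize("et souligne 'l'imagerie'.")
    # => ["et", "souligne", "'", "l'imagerie", "'", "."]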
62 changes: 62 additions & 0 deletions test/development_tests/test_tokenize_urls.rb
@@ -0,0 +1,62 @@
# coding: utf-8
require 'minitest/autorun'
require 'minitest/spec'
require 'tokenizer'

class TestTokenizerUrls < Minitest::Test
  def setup
    @t = Tokenizer::WhitespaceTokenizer.new(:de)
  end

  def test_url_tokenization_001
    assert_equal(@t.tokenize('test url www.google.com.'),
                 ['test', 'url', 'www.google.com', '.'])
  end

  def test_url_tokenization_002
    assert_equal(@t.tokenize('test url www.google.com.au.'),
                 ['test', 'url', 'www.google.com.au', '.'])
  end

  def test_url_tokenization_003
    assert_equal(@t.tokenize('test url http://www.google.com.au.'),
                 ['test', 'url', 'http://www.google.com.au', '.'])
  end

  def test_url_tokenization_004
    assert_equal(@t.tokenize('test url https://www.google.com.au.'),
                 ['test', 'url', 'https://www.google.com.au', '.'])
  end

  def test_url_tokenization_005
    assert_equal(@t.tokenize('test url ftp://www.google.com.au.'),
                 ['test', 'url', 'ftp://www.google.com.au', '.'])
  end

  def test_url_tokenization_006
    assert_equal(@t.tokenize('test url Google.com.'),
                 ['test', 'url', 'Google.com', '.'])
  end

  def test_url_tokenization_007
    assert_equal(@t.tokenize('test url Au.ac.'),
                 ['test', 'url', 'Au.ac', '.'])
  end

  def test_url_tokenization_008
    assert_equal(@t.tokenize('test url google.com. Another sentence.'),
                 ['test', 'url', 'google.com', '.', 'Another', 'sentence', '.'])
  end

  def test_url_tokenization_009
    assert_equal(@t.tokenize('test url www.culture.gov.uk/heritage/search_frame.asp?name=/heritage/lib1.html another word.'),
                 ['test', 'url', 'www.culture.gov.uk/heritage/search_frame.asp?name=/heritage/lib1.html', 'another', 'word', '.'])
  end

  def test_url_tokenization_010
    assert_equal(@t.tokenize('test url www.culture.gov.uk/heritage/search_frame.asp?name=/heritage/lib1.html. Another sentence.'),
                 ['test', 'url', 'www.culture.gov.uk/heritage/search_frame.asp?name=/heritage/lib1.html', '.', 'Another', 'sentence', '.'])
  end
end


63 changes: 63 additions & 0 deletions test/regression_tests/test_de_tokenizer.rb
@@ -31,6 +31,69 @@ def test_tokenization_002
    output = @t.tokenize(input)
    assert_equal(etalon, output)
  end

  def test_tokenization_003
    input = 'Try some code: test(this).'
    etalon = %w(Try some code : test ( this ) .)
    output = @t.tokenize(input)
    assert_equal(etalon, output)
  end

  def test_tokenization_004
    input = 'Try an email: [email protected].'
    etalon = %w(Try an email : [email protected] .)
    output = @t.tokenize(input)
    assert_equal(etalon, output)
  end

  def test_tokenization_005
    input = "et souligne 'l'interrelation étroite de l'imagerie' avec le comportement."
    etalon = %w(et souligne ' l'interrelation étroite de l'imagerie ' avec le comportement .)
    output = @t.tokenize(input)
    assert_equal(etalon, output)
  end

  def test_tokenization_006
    input = 'Try some code: test(inner(brackets)also).'
    etalon = %w(Try some code : test ( inner ( brackets ) also ) .)
    output = @t.tokenize(input)
    assert_equal(etalon, output)
  end

  def test_tokenization_007
    input = 'Try some code: test[(inner(brackets)also)].'
    etalon = %w(Try some code : test [ ( inner ( brackets ) also ) ] .)
    output = @t.tokenize(input)
    assert_equal(etalon, output)
  end

  def test_tokenization_008
    input = "Check single quotes: 'quoted string'."
    etalon = %w(Check single quotes : ' quoted string ' .)
    output = @t.tokenize(input)
    assert_equal(etalon, output)
  end

  def test_tokenization_009
    input = "Check silly embedded single quotes: 'quoted 'embedded string' string'."
    etalon = %w(Check silly embedded single quotes : ' quoted ' embedded string ' string ' .)
    output = @t.tokenize(input)
    assert_equal(etalon, output)
  end

  def test_tokenization_010
    input = "Check quotes: ('test quotes')."
    etalon = %w(Check quotes : ( ' test quotes ' ) .)
    output = @t.tokenize(input)
    assert_equal(etalon, output)
  end

  def test_tokenization_011
    input = "Check quotes: (''test quotes'')."
    etalon = %w(Check quotes : ( ' ' test quotes ' ' ) .)
    output = @t.tokenize(input)
    assert_equal(etalon, output)
  end
end

describe Tokenizer do