61 changes: 56 additions & 5 deletions lib/tokenizer/tokenizer.rb
@@ -22,7 +22,20 @@ class WhitespaceTokenizer
PAIR_POST = [')', '}', ']', '>', '»', '“']

# Characters which can be both prefixes AND suffixes.
PRE_N_POST = ['"', "'"]
PRE_N_POST = ['"']

# Characters which can be both prefixes AND suffixes, but which are only
# splittable at the beginning or end of a token, or when separated from
# the token edge only by other splittables.
# Taking the single quote "'" as a PRE_N_POST_ONLY splittable, the
# following are valid uses as a splittable:
#   'test quotes'
#   'test quotes'.    <- suffixed by another splittable
#   ('test quotes').  <- prefixed and suffixed by another splittable
# The following are not valid uses as a splittable:
#   l'interrelation
#   l'imagerie
PRE_N_POST_ONLY = ["'"]

private_constant :FS
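
For reviewers, a minimal sketch of the intended PRE_N_POST_ONLY behaviour (assumes a build of this branch; the sample strings are illustrative and the expected arrays are hand-derived to mirror the regression tests below):

    require 'tokenizer'

    t = Tokenizer::WhitespaceTokenizer.new(:de)
    t.tokenize("'quoted'")   # edge quotes split out:    ["'", "quoted", "'"]
    t.tokenize("l'imagerie") # word-internal quote kept: ["l'imagerie"]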

@@ -49,12 +62,50 @@ def tokenize(str)

splittables = SIMPLE_PRE + SIMPLE_POST + PAIR_PRE + PAIR_POST + PRE_N_POST
pattern = Regexp.new("[^#{Regexp.escape(splittables.join)}]+")
pattern_prepostonly_pfix =
  Regexp.new("^[#{Regexp.escape((splittables + PRE_N_POST_ONLY).join)}]*" \
             "[#{Regexp.escape(PRE_N_POST_ONLY.join)}]+" \
             "[#{Regexp.escape((splittables + PRE_N_POST_ONLY).join)}]*")
pattern_prepostonly_sfix =
  Regexp.new("[#{Regexp.escape((splittables + PRE_N_POST_ONLY).join)}]*" \
             "[#{Regexp.escape(PRE_N_POST_ONLY.join)}]+" \
             "[#{Regexp.escape((splittables + PRE_N_POST_ONLY).join)}]*$")
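
A rough illustration of what the two anchored patterns match, using a reduced splittables set for readability; the expected values in the comments are hand-derived assumptions, not captured output:

    splittables_demo = ['.', ',', '(', ')']
    only = ["'"]
    all  = Regexp.escape((splittables_demo + only).join)
    pfix = Regexp.new("^[#{all}]*[#{Regexp.escape(only.join)}]+[#{all}]*")

    "('test"[pfix]     # => "('"  quote at the token edge, preceded by a splittable
    "l'imagerie"[pfix] # => nil   word-internal quote, so the anchored match fails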
# The most accommodating URL regex I found was here:
# http://stackoverflow.com/a/24058129/4852737
url_pattern = %r{(([\w]+:)?\/\/)?(([\d\w]|%[a-fA-F\d]{2,2})+(:([\d\w]|%[a-fA-F\d]{2,2})+)?@)?([\d\w][-\d\w]{0,253}[\d\w]\.)+[\w]{2,63}(:[\d]+)?(\/([-+_~.\d\w]|%[a-fA-F\d]{2,2})*)*(\?(&?([-+_~.\d\w]|%[a-fA-F\d]{2,2})=?)*)?(#([-+_~.\d\w]|%[a-fA-F\d]{2,2})*)?}
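
A few hand-checked probes of the pattern; the expected results are read off the regex, not captured from a test run. Note the pattern is unanchored, which is also why the email address in test_tokenization_004 below is caught by the URL branch:

    url_pattern.match('www.google.com.au') # => MatchData (dotted host, scheme optional)
    url_pattern.match('user@example.com')  # => MatchData (userinfo@host form; address is illustrative)
    url_pattern.match('comportement')      # => nil (no dot-separated host)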
output = []
tokens.each do |token|
  prefix, stem, suffix = token.partition(pattern)
  output << prefix.split('') unless prefix.empty?
  output << stem unless stem.empty?
  output << suffix.split('') unless suffix.empty?
  if url_pattern.match(token)
    # If the token is recognised as a URL and its last character is a
    # splittable, split that character off.
    output << (splittables.include?(token[-1]) ?
               [token[0...-1], token[-1]] : token)
  else
    # If the prefix characters are PRE_N_POST_ONLY splittables, split them off.
    prefix, stem, suffix = token.partition(pattern_prepostonly_pfix)
    output << stem.split('') unless stem.empty?
    token_remaining = stem.empty? ? prefix : suffix
    prefix, stem, suffix = token_remaining.partition(pattern)
    output << prefix.split('') unless prefix.empty?
    unless stem.empty?
      # If the suffix characters are any splittable, including
      # PRE_N_POST_ONLY, split them off.
      prefix, stem, suffix_discard = stem.partition(pattern_prepostonly_sfix)
      output << prefix unless prefix.empty?
      output << stem.split('') unless stem.empty?
    end
    # While the suffix is not empty, emit its first character as a splittable
    # token and partition the remaining suffix.
    while suffix.length > 0
      prior_suffix = suffix
      output << suffix[0]
      prefix, stem, suffix = prior_suffix[1..-1].partition(pattern)
      output << prefix.split('') unless prefix.empty?
      output << stem unless stem.empty?
    end
  end
end

output.flatten
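
Taken together, a sketch of the end-to-end behaviour this diff aims for; the inputs are abbreviated variants of the new tests, and the expected arrays are hand-derived:

    t = Tokenizer::WhitespaceTokenizer.new(:de)
    t.tokenize('test url www.google.com.')
    # => ["test", "url", "www.google.com", "."]
    t.tokenize("et souligne 'l'imagerie'.")
    # => ["et", "souligne", "'", "l'imagerie", "'", "."]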
62 changes: 62 additions & 0 deletions test/development_tests/test_tokenize_urls.rb
@@ -0,0 +1,62 @@
# coding: utf-8
require 'minitest/autorun'
require 'minitest/spec'
require 'tokenizer'

class TestTokenizerUrls < Minitest::Test
  def setup
    @t = Tokenizer::WhitespaceTokenizer.new(:de)
  end

  def test_url_tokenization_001
    assert_equal(@t.tokenize('test url www.google.com.'),
                 ['test', 'url', 'www.google.com', '.'])
  end

  def test_url_tokenization_002
    assert_equal(@t.tokenize('test url www.google.com.au.'),
                 ['test', 'url', 'www.google.com.au', '.'])
  end

  def test_url_tokenization_003
    assert_equal(@t.tokenize('test url http://www.google.com.au.'),
                 ['test', 'url', 'http://www.google.com.au', '.'])
  end

  def test_url_tokenization_004
    assert_equal(@t.tokenize('test url https://www.google.com.au.'),
                 ['test', 'url', 'https://www.google.com.au', '.'])
  end

  def test_url_tokenization_005
    assert_equal(@t.tokenize('test url ftp://www.google.com.au.'),
                 ['test', 'url', 'ftp://www.google.com.au', '.'])
  end

  def test_url_tokenization_006
    assert_equal(@t.tokenize('test url Google.com.'),
                 ['test', 'url', 'Google.com', '.'])
  end

  def test_url_tokenization_007
    assert_equal(@t.tokenize('test url Au.ac.'),
                 ['test', 'url', 'Au.ac', '.'])
  end

  def test_url_tokenization_008
    assert_equal(@t.tokenize('test url google.com. Another sentence.'),
                 ['test', 'url', 'google.com', '.', 'Another', 'sentence', '.'])
  end

  def test_url_tokenization_009
    assert_equal(@t.tokenize('test url www.culture.gov.uk/heritage/search_frame.asp?name=/heritage/lib1.html another word.'),
                 ['test', 'url', 'www.culture.gov.uk/heritage/search_frame.asp?name=/heritage/lib1.html', 'another', 'word', '.'])
  end

  def test_url_tokenization_010
    assert_equal(@t.tokenize('test url www.culture.gov.uk/heritage/search_frame.asp?name=/heritage/lib1.html. Another sentence.'),
                 ['test', 'url', 'www.culture.gov.uk/heritage/search_frame.asp?name=/heritage/lib1.html', '.', 'Another', 'sentence', '.'])
  end
end


63 changes: 63 additions & 0 deletions test/regression_tests/test_de_tokenizer.rb
@@ -31,6 +31,69 @@ def test_tokenization_002
    output = @t.tokenize(input)
    assert_equal(etalon, output)
  end

  def test_tokenization_003
    input = 'Try some code: test(this).'
    etalon = %w(Try some code : test ( this ) .)
    output = @t.tokenize(input)
    assert_equal(etalon, output)
  end

  def test_tokenization_004
    input = 'Try an email: [email protected].'
    etalon = %w(Try an email : [email protected] .)
    output = @t.tokenize(input)
    assert_equal(etalon, output)
  end

  def test_tokenization_005
    input = "et souligne 'l'interrelation étroite de l'imagerie' avec le comportement."
    etalon = %w(et souligne ' l'interrelation étroite de l'imagerie ' avec le comportement .)
    output = @t.tokenize(input)
    assert_equal(etalon, output)
  end

  def test_tokenization_006
    input = 'Try some code: test(inner(brackets)also).'
    etalon = %w(Try some code : test ( inner ( brackets ) also ) .)
    output = @t.tokenize(input)
    assert_equal(etalon, output)
  end

  def test_tokenization_007
    input = 'Try some code: test[(inner(brackets)also)].'
    etalon = %w(Try some code : test [ ( inner ( brackets ) also ) ] .)
    output = @t.tokenize(input)
    assert_equal(etalon, output)
  end

  def test_tokenization_008
    input = "Check single quotes: 'quoted string'."
    etalon = %w(Check single quotes : ' quoted string ' .)
    output = @t.tokenize(input)
    assert_equal(etalon, output)
  end

  def test_tokenization_009
    input = "Check silly embedded single quotes: 'quoted 'embedded string' string'."
    etalon = %w(Check silly embedded single quotes : ' quoted ' embedded string ' string ' .)
    output = @t.tokenize(input)
    assert_equal(etalon, output)
  end

  def test_tokenization_010
    input = "Check quotes: ('test quotes')."
    etalon = %w(Check quotes : ( ' test quotes ' ) .)
    output = @t.tokenize(input)
    assert_equal(etalon, output)
  end

  def test_tokenization_011
    input = "Check quotes: (''test quotes'')."
    etalon = %w(Check quotes : ( ' ' test quotes ' ' ) .)
    output = @t.tokenize(input)
    assert_equal(etalon, output)
  end
end

describe Tokenizer do