diff --git a/lang_data/en/infix.txt b/lang_data/en/infix.txt
index 8c83b7d4cd4..aa36da8e9cd 100644
--- a/lang_data/en/infix.txt
+++ b/lang_data/en/infix.txt
@@ -3,3 +3,4 @@
 (?<=[a-zA-Z])-(?=[a-zA-z])
 (?<=[a-zA-Z])--(?=[a-zA-z])
 (?<=[0-9])-(?=[0-9])
+(?<=[A-Za-z]),(?=[A-Za-z])
diff --git a/spacy/tests/tokenizer/test_infix.py b/spacy/tests/tokenizer/test_infix.py
index 4edc031d756..351394021ad 100644
--- a/spacy/tests/tokenizer/test_infix.py
+++ b/spacy/tests/tokenizer/test_infix.py
@@ -47,3 +47,10 @@ def test_double_hyphen(en_tokenizer):
     assert tokens[8].text == u'--'
     assert tokens[9].text == u'people'
 
+
+def test_infix_comma(en_tokenizer):
+    # Re issue #326
+    tokens = en_tokenizer(u'Hello,world')
+    assert tokens[0].text == u'Hello'
+    assert tokens[1].text == u','
+    assert tokens[2].text == u'world'