Commit fdfa406a authored by Dos Santos David's avatar Dos Santos David

apply same word separator for the tokenizer

parent 8bcfab93
from itertools import chain
import re
from gogole.tokenizer.tokenizer import Tokenizer
......@@ -7,8 +8,6 @@ class SimpleTokenizer(Tokenizer):
Simple tokenizer using any space or punctuation sign as separator
"""
SEPARATORS = [" ", ".", ",", "!", "?", ":", ";", "\n", "(", ")"]
STOP_WORDS_LOCATION = 'data/common_words'
def __init__(self):
......@@ -22,12 +21,12 @@ class SimpleTokenizer(Tokenizer):
def get_tokens(self, document: 'gogole.document.Document'):
tokens = [document.get_raw_content()]
raw_content = document.get_raw_content()
#
# 1/ split by any separator
#
words = self._split_strings(tokens)
words = self._split_strings(raw_content)
#
# 2/ use lower case
......@@ -42,8 +41,6 @@ class SimpleTokenizer(Tokenizer):
return filtered_words
def _split_strings(self, tokens):
for separator in self.SEPARATORS:
tokens = chain(*[t.split(separator) for t in tokens])
def _split_strings(self, content: str):
return re.sub("[^\w]", " ", content).split()
return [t for t in tokens if t != ""]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment