simple_tokenizer.py 1.33 KB
Newer Older
Dos Santos David's avatar
Dos Santos David committed
1 2 3 4 5 6 7 8 9
from itertools import chain

from gogole.tokenizer.abstract_tokenizer import AbstractTokenizer

class SimpleTokenizer(AbstractTokenizer):
    """
    Simple tokenizer using any space or punctuation sign as separator
    """

Dos Santos David's avatar
Dos Santos David committed
10
    SEPARATORS = [" ", ".", ",", "!", "?", ":", ";", "\n", "(", ")"]
Dos Santos David's avatar
Dos Santos David committed
11

Dos Santos David's avatar
Dos Santos David committed
12 13 14 15 16 17 18 19 20 21 22 23 24
    def __init__(self, stop_words_filename=None):
        self.stop_words_filename = stop_words_filename

        self._stop_words = set()
        if stop_words_filename is not None:
            self.load_stop_words()

    def load_stop_words(self):
        with open(self.stop_words_filename, 'r') as f:
            self._stop_words = set([word.strip() for word in f.readlines()])


    def get_tokens(self, document: 'gogole.document.Document'):
25
        tokens = [document.get_raw_content()]
Dos Santos David's avatar
Dos Santos David committed
26

Dos Santos David's avatar
Dos Santos David committed
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
        #
        # 1/ split by any separator
        #
        words = self._split_strings(tokens)

        #
        # 2/ use lower case
        #
        words = [x.lower() for x in words]

        #
        # 3/ filter words using to a stop-word list
        #
        filtered_words = [word for word in words if word not in self._stop_words]

        return filtered_words

Dos Santos David's avatar
Dos Santos David committed
44

Dos Santos David's avatar
Dos Santos David committed
45 46 47
    def _split_strings(self, tokens):
        for separator in self.SEPARATORS:
            tokens = chain(*[t.split(separator) for t in tokens])
Dos Santos David's avatar
Dos Santos David committed
48

Dos Santos David's avatar
Dos Santos David committed
49
        return [t for t in tokens if t != ""]