Commit 5a91324b authored by Dos Santos David

rename AbstractTokenizer to Tokenizer

parent 7fc98aa9
class AbstractTokenizer:
    """
    Abstract Tokenizer

    Placeholder base class: ``get_tokens`` is meant to be overridden and
    the version defined here only raises.
    """

    @staticmethod
    def get_tokens(document: 'Document'):
        """
        Tokenize ``document``; not implemented on the abstract class.

        :param document: the document to tokenize (unused here)
        :raises NotImplementedError: always
        """
        # The previous code raised ``Error``, a name that is not defined in
        # this module, so callers got a NameError instead of a clear signal.
        raise NotImplementedError("Method get_tokens not implemented")
from gogole.tokenizer.abstract_tokenizer import AbstractTokenizer
import collections
class NoTokenizer(AbstractTokenizer):
from gogole.tokenizer.tokenizer import Tokenizer
class NoTokenizer(Tokenizer):
    """
    Tokenizer that only splits the raw content of a document on whitespace.
    """

    def get_tokens(self, document):
        """
        Return the whitespace-separated pieces of the document's raw content.

        :param document: object exposing ``get_raw_content()``
            (presumably returning a ``str`` -- confirm against Document)
        :return: list produced by ``strip().split()`` on the raw content
        """
        raw_content = document.get_raw_content()
        trimmed = raw_content.strip()
        return trimmed.split()
from itertools import chain
from gogole.tokenizer.abstract_tokenizer import AbstractTokenizer
from gogole.tokenizer.tokenizer import Tokenizer
class SimpleTokenizer(AbstractTokenizer):
class SimpleTokenizer(Tokenizer):
"""
Simple tokenizer using any space or punctuation sign as separator
"""
......
import collections
class Tokenizer:
    """
    Basic Tokenizer

    Base class for tokenizers: subclasses override ``get_tokens``, and
    ``get_counted_tokens`` is derived from it.
    """

    def get_tokens(self, document: 'gogole.document.Document'):
        """
        Return the tokens of ``document``.

        The base implementation always raises; subclasses must override it.

        :param document: the document to tokenize
        :raises NotImplementedError: always, on the base class
        """
        # NotImplementedError is still an Exception subclass, so callers that
        # caught the former bare Exception keep working. The message now
        # names the method that actually exists (was "get_all_tokens").
        raise NotImplementedError("method get_tokens not implemented")

    def get_counted_tokens(self, document: 'gogole.document.Document'):
        """
        Count how many times each token occurs in ``document``.

        :param document: the document to tokenize
        :return: ``collections.Counter`` built from ``self.get_tokens(document)``
        """
        return collections.Counter(self.get_tokens(document))
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment