Ce serveur Gitlab sera éteint le 30 juin 2020, pensez à migrer vos projets vers les serveurs gitlab-research.centralesupelec.fr et gitlab-student.centralesupelec.fr !

cacm_document.py 1.34 KB
Newer Older
Dos Santos David's avatar
Dos Santos David committed
1 2
from gogole.tokenizer.simple_tokenizer import SimpleTokenizer

3
class CACMDocument:
    """A document made of an identifier plus title, abstract and keywords text.

    The three text fields are exposed as properties whose setters strip
    leading and trailing whitespace. All of them default to the empty
    string, so a freshly created document can be printed or tokenized
    before any field has been filled in.
    """

    # Tokenizer shared by every `tokenize()` call that does not supply one.
    # Created on first use rather than at class-definition time.
    _default_tokenizer = None

    def __init__(self, document_id):
        """Create an empty document identified by *document_id*."""
        self._document_id = document_id
        # Initialise every text field so the getters never hit a missing
        # attribute (title used to be left unset).
        self.title = ""
        self.abstract = ""
        self.keywords = ""

    @property
    def document_id(self):
        """Identifier given at construction time (read-only)."""
        return self._document_id

    @property
    def title(self) -> str:
        """Title of the document, without surrounding whitespace."""
        return self._title

    @title.setter
    def title(self, value: str):
        self._title = value.strip()

    @property
    def abstract(self):
        """Abstract of the document, without surrounding whitespace."""
        return self._abstract

    @abstract.setter
    def abstract(self, value):
        self._abstract = value.strip()

    @property
    def keywords(self):
        """Keywords of the document, without surrounding whitespace."""
        return self._keywords

    @keywords.setter
    def keywords(self, value):
        self._keywords = value.strip()

    def __str__(self):
        """Return a multi-line, human-readable summary of the document.

        The first line holds the id and the title; the keywords and
        abstract lines are only emitted when the field is non-empty.
        """
        return "[ID #{doc_id}] {title}\n{keywords}{abstract}".format(
            doc_id=self.document_id,
            title=self.title,
            keywords="keywords: {}\n".format(self.keywords) if self.keywords != "" else "",
            abstract="abstract : {}\n".format(self.abstract) if self.abstract != "" else "",
        )

    def get_raw_content(self):
        """Return title, abstract and keywords joined by single spaces."""
        return " ".join([self.title, self.abstract, self.keywords])

    def tokenize(self, tokenizer=None):
        """Tokenize the document.

        Args:
            tokenizer: object exposing ``get_tokens(document)``. When omitted,
                a single ``SimpleTokenizer`` instance, created on first use
                and then reused, is applied.

        Returns:
            Whatever ``tokenizer.get_tokens(self)`` returns.
        """
        if tokenizer is None:
            # Build the default lazily: a call in the signature would run at
            # import time, even if the default is never needed.
            if CACMDocument._default_tokenizer is None:
                CACMDocument._default_tokenizer = SimpleTokenizer()
            tokenizer = CACMDocument._default_tokenizer
        return tokenizer.get_tokens(self)