Commit ab59b47a authored by Dos Santos David's avatar Dos Santos David Committed by David Dos Santos

init commit

parents
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
.static_storage/
.media/
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
class Collection:
    """
    A group of documents addressed by their id.
    """

    def __init__(self, documents):
        """
        params:
            - documents: dict id -> Document
        """
        self._documents = documents

    def tokenize(self):
        """
        Tokenize every document of the collection.

        returns:
            dict id -> result of calling tokenize() on the matching document
        """
        return {
            doc_id: document.tokenize()
            for doc_id, document in self._documents.items()
        }

    def tokensize(self):
        """
        Misspelled original name of tokenize(), kept so that existing
        callers keep working.
        """
        return self.tokenize()
version: "2"
services:
  app:
    image: "python:3.6"
    volumes:
      - "./:/app"
    working_dir: "/app"
    command: "python3 main.py -f dumps/cacm.all"
import queue
class Document:
    """
    A single document of a collection.

    The text fields (title, published_at, authors, added_at, references)
    are stripped of surrounding whitespace when assigned. They default to
    an empty string, so a document that lacks some of them can still be
    printed and tokenized.
    """

    def __init__(self, document_id):
        """
        params:
            - document_id: identifier of the document
        """
        self._document_id = document_id
        # Start every text field empty: the getters, __str__ and tokenize()
        # read these attributes and must not fail when a field was never set.
        self._title = ""
        self._published_at = ""
        self._authors = ""
        self._added_at = ""
        self._references = ""

    @property
    def document_id(self):
        return self._document_id

    @property
    def title(self) -> str:
        return self._title

    @title.setter
    def title(self, value: str) -> None:
        self._title = value.strip()

    @property
    def published_at(self) -> str:
        return self._published_at

    @published_at.setter
    def published_at(self, value: str) -> None:
        self._published_at = value.strip()

    @property
    def authors(self) -> str:
        return self._authors

    @authors.setter
    def authors(self, value: str) -> None:
        self._authors = value.strip()

    @property
    def added_at(self) -> str:
        return self._added_at

    @added_at.setter
    def added_at(self, value: str) -> None:
        self._added_at = value.strip()

    @property
    def references(self) -> str:
        return self._references

    @references.setter
    def references(self, value: str) -> None:
        self._references = value.strip()

    def __str__(self):
        """
        One-line summary: id, upper-cased title and publication date.
        """
        return "[id: {doc_id}] {title} (published at {published_at})".format(
            doc_id=self.document_id,
            title=self.title.upper(),
            published_at=self.published_at,
        )

    def tokenize(self):
        """
        Tokenize a document

        returns:
            list of the tokens of the title, the authors and the
            references, in that order
        """
        tokens = []
        for item in (self._title, self._authors, self._references):
            tokens.extend(self._tokenize_element(item))
        return tokens

    @staticmethod
    def _tokenize_element(item):
        """
        Tokenize an element of the document

        "!" and "." are treated as separators, then the text is split on
        whitespace.
        """
        # TODO: make it better
        return item.replace("!", " ").replace(".", " ").split()
This diff is collapsed.
import argparse
from document import Document
# Line prefixes recognised in the collection file, each paired with the
# name under which the content that follows it is stored.
MARKERS = dict([
    (".I", "document"),
    (".T", "title"),
    (".B", "published_at"),
    (".A", "authors"),
    (".N", "added_at"),
    (".X", "references"),
])

# Prefix of the line that opens a new document.
DOCUMENT_MARKER = ".I"
def _store_field(documents, document_id, marker, lines):
    """
    Assign the buffered lines to the field opened by `marker` on the
    document `document_id`. Does nothing when no field is open or when the
    document does not exist (e.g. a field marker seen before any document).
    """
    if marker is None or document_id not in documents:
        return
    setattr(documents[document_id], MARKERS[marker], "".join(lines))


def extract_documents_from_file(file, max_document_id=100):
    """
    Parse a collection file into Document objects.

    The input is read line by line. A line starting with DOCUMENT_MARKER
    opens a new document whose id is the integer that follows the marker.
    A line starting with any other key of MARKERS opens a field: the lines
    that follow, up to the next marker or the end of the input, are the
    content of that field and are assigned to the attribute named in
    MARKERS.

    params:
        - file: iterable of text lines (e.g. an open text file)
        - max_document_id: parsing stops at the first document whose id is
          greater than this value; None disables the limit
    returns:
        dict id -> Document
    raises:
        ValueError if the text after a document marker is not an integer
    """
    documents = dict()
    current_document_id = None
    current_marker = None  # field marker whose content is being buffered
    buffer = []

    for line in file:
        marker = next((m for m in MARKERS if line.startswith(m)), None)
        if marker is None:
            buffer.append(line)
            continue

        # Any marker, including the one opening the next document, ends the
        # field being read: store it before moving on so that the last
        # field of a document is not lost.
        _store_field(documents, current_document_id, current_marker, buffer)
        buffer = []

        if marker == DOCUMENT_MARKER:
            # new document
            current_marker = None
            current_document_id = int(line[len(marker) + 1:])
            if max_document_id is not None and current_document_id > max_document_id:
                break
            documents[current_document_id] = Document(current_document_id)
        else:
            current_marker = marker

    # Store the field still open at the end of the input. This is a no-op
    # after the early break above, where no field is open.
    _store_field(documents, current_document_id, current_marker, buffer)
    return documents
def main():
    """
    Command-line entry point: parse the collection file given with
    -f/--file.
    """
    parser = argparse.ArgumentParser()
    # The file is mandatory: without it open() would be called with None.
    parser.add_argument("-f", "--file", required=True, help="input collection file")
    args = parser.parse_args()

    with open(args.file, 'r') as f:
        # NOTE: the parsed collection is not consumed any further here.
        documents = extract_documents_from_file(f)


if __name__ == "__main__":
    main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment