Commit 941d7c2a authored by Dos Santos David, committed by David Dos Santos

analyze command

parent ee848f06
a
about
above
accordingly
across
after
afterwards
again
against
all
almost
alone
along
already
also
although
always
am
among
amongst
an
and
another
any
anybody
anyhow
anyone
anything
anywhere
apart
are
around
as
aside
at
away
awfully
b
be
became
because
become
becomes
becoming
been
before
beforehand
behind
being
below
beside
besides
best
better
between
beyond
both
brief
but
by
c
can
cannot
cant
certain
co
consequently
could
d
did
do
does
doing
done
down
downwards
during
e
each
eg
eight
either
else
elsewhere
enough
et
etc
even
ever
every
everybody
everyone
everything
everywhere
ex
except
f
far
few
fifth
first
five
for
former
formerly
forth
four
from
further
furthermore
g
get
gets
go
gone
got
h
had
hardly
has
have
having
he
hence
her
here
hereafter
hereby
herein
hereupon
hers
herself
him
himself
his
hither
how
howbeit
however
i
ie
if
immediate
in
inasmuch
inc
indeed
inner
insofar
instead
into
inward
is
it
its
itself
j
just
k
keep
kept
l
last
latter
latterly
least
less
lest
like
little
ltd
m
many
may
me
meanwhile
might
more
moreover
most
mostly
much
must
my
myself
n
namely
near
neither
never
nevertheless
new
next
nine
no
nobody
none
noone
nor
not
nothing
novel
now
nowhere
o
of
off
often
oh
old
on
once
one
ones
only
onto
or
other
others
otherwise
ought
our
ours
ourselves
out
outside
over
overall
own
p
particular
particularly
per
perhaps
please
plus
probably
q
que
quite
r
rather
really
relatively
respectively
right
s
said
same
second
secondly
see
seem
seemed
seeming
seems
self
selves
sensible
serious
seven
several
shall
she
should
since
six
so
some
somebody
somehow
someone
something
sometime
sometimes
somewhat
somewhere
still
sub
such
sup
t
than
that
the
their
theirs
them
themselves
then
thence
there
thereafter
thereby
therefore
therein
thereupon
these
they
third
this
thorough
thoroughly
those
though
three
through
throughout
thru
thus
to
together
too
toward
towards
twice
two
u
under
until
unto
up
upon
us
v
various
very
via
vs
viz
w
was
we
well
went
were
what
whatever
when
whence
whenever
where
whereafter
whereas
whereby
wherein
whereupon
wherever
whether
which
while
whither
who
whoever
whole
whom
whose
why
will
with
within
without
would
x
y
yet
you
your
yours
yourself
yourselves
z
zero
/*
manual
unix
programmer's
file
files
used
name
specified
value
given
return
use
following
current
using
normally
returns
returned
causes
described
contains
example
possible
useful
available
associated
would
cause
provides
taken
unless
sent
followed
indicates
currently
necessary
specify
contain
indicate
appear
different
indicated
containing
gives
placed
uses
appropriate
automatically
ignored
changes
way
usually
allows
corresponding
specifying
class Collection:
    def __init__(self, documents):
        """
        params:
        - documents: dict id -> Document
        """
        self._documents = documents

    def tokenize(self):
        """
        Tokenize every document of the collection
        """
        return {k: v.tokenize() for k, v in self._documents.items()}
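A minimal usage sketch for Collection (not part of the commit): the collection path is an assumption, while CACMParser and Document come from the modules changed further down.

# Hypothetical usage; "data/cacm.all" is an assumed path to the CACM collection file.
from gogole.parser.cacm_parser import CACMParser

documents = CACMParser("data/cacm.all").parse(limit=10)   # dict: id -> Document
collection = Collection(documents)
tokens_by_doc = collection.tokenize()                     # dict: id -> tokens of that document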
from gogole.tokenizer.simple_tokenizer import SimpleTokenizer
from gogole.parser.cacm_parser import CACMParser
from gogole.utils import heap_law


class AnalyzeCommand:

    ANALYZE_COMMANDS = ['all', 'count_tokens', 'heap_law']

    @staticmethod
    def load_documents(collection_file, parser_cls=CACMParser, limit=None):
        parser = parser_cls(collection_file)
        return parser.parse(limit=limit)

    @classmethod
    def run_analyze(cls, args):
        commands = args.sub_command

        if 'all' in commands:
            commands = cls.ANALYZE_COMMANDS

        documents = cls.load_documents(args.file)
        tokenizer = SimpleTokenizer(args.stop_words_file)

        tokens_by_document = {doc_id: tokenizer.get_tokens(doc) for doc_id, doc in documents.items()}
        all_tokens = [token for tokens in tokens_by_document.values() for token in tokens]

        if 'count_tokens' in commands or 'heap_law' in commands:
            print("{:*^50}\n".format(" Count tokens "))

            count_tokens = len(all_tokens)
            print("Total count of tokens : \t{:,}".format(count_tokens))

            vocabulary_size = len(set(all_tokens))
            print("Vocabulary size: \t\t{:,}".format(vocabulary_size))

        if 'heap_law' in commands:
            print("\n\n{:*^50}\n".format(" Count tokens for half the collection "))

            # get half the documents
            median_doc_id = sorted(documents.keys())[len(documents.keys())//2]

            tokens_by_document_2 = {doc_id: tokens for doc_id, tokens in tokens_by_document.items() if doc_id <= median_doc_id}
            all_tokens_2 = [token for tokens in tokens_by_document_2.values() for token in tokens]

            count_tokens_2 = len(all_tokens_2)
            print("Total count of tokens : \t{:,}".format(count_tokens_2))

            vocabulary_size_2 = len(set(all_tokens_2))
            print("Vocabulary size: \t\t{:,}".format(vocabulary_size_2))

            b, k = heap_law.compute_parameters(count_tokens, vocabulary_size, count_tokens_2, vocabulary_size_2)

            print("\n\n{:*^50}\n".format(" Heaps' law parameter estimation "))
            print("b: \t{0:.3g}".format(b))
            print("k: \t{0:.3g}".format(k))

            print("\nestimation of vocabulary size for 1M tokens : {}".format(heap_law.estimate_vocabulary_size(b, k, 1000*1000)))
@@ -7,6 +7,7 @@ class Document:
     def __init__(self, document_id):
         self._document_id = document_id
         self.abstract = ""
+        self.keywords = ""

     @property
     def document_id(self):
@@ -20,38 +21,6 @@ class Document:
     def title(self, value: str):
         self._title = value.strip()

-    @property
-    def published_at(self) -> str:
-        return self._published_at
-
-    @published_at.setter
-    def published_at(self, value: str) -> str:
-        self._published_at = value.strip()
-
-    @property
-    def authors(self):
-        return self._authors
-
-    @authors.setter
-    def authors(self, value):
-        self._authors = value.strip()
-
-    @property
-    def added_at(self):
-        return self._added_at
-
-    @added_at.setter
-    def added_at(self, value):
-        self._added_at = value.strip()
-
-    @property
-    def references(self):
-        return self._references
-
-    @references.setter
-    def references(self, value):
-        self._references = value.strip()
-
     @property
     def abstract(self):
         return self._abstract
@@ -60,20 +29,23 @@ class Document:
     def abstract(self, value):
         self._abstract = value.strip()

+    @property
+    def keywords(self):
+        return self._keywords
+
+    @keywords.setter
+    def keywords(self, value):
+        self._keywords = value.strip()
+
     def __str__(self):
-        s = "id: {doc_id}\ntitle :{title}\npublished at {published_at}\n".format(
-            doc_id=self.document_id,
-            title=self.title.upper(),
-            published_at=self.published_at,
-        )
-
-        if self.abstract != "":
-            s += "abstract :{abstract}\n".format(abstract=self.abstract)
-
-        return s
-
-    def tokenize(self, tokenizer=SimpleTokenizer):
+        return "[ID #{doc_id}] {title}\n{keywords}{abstract}".format(
+            doc_id=self.document_id,
+            title=self.title,
+            keywords="keywords: {}\n".format(self.keywords) if self.keywords != "" else "",
+            abstract="abstract : {}\n".format(self.abstract) if self.abstract != "" else ""
+        )
+
+    def tokenize(self, tokenizer=SimpleTokenizer()):
         """
         Tokenize a document
         """
...
@@ -5,13 +5,12 @@ class CACMParser:
     MARKERS = {
         '.I': 'document',
         '.T': 'title',
-        '.B': 'published_at',
-        '.A': 'authors',
-        '.N': 'added_at',
-        '.X': 'references',
-        '.W': 'abstract'
+        '.W': 'abstract',
+        '.K': 'keywords'
     }
+
+    ALL_MARKERS = ['.I', '.T', '.W', '.B', '.A', '.N', '.X', '.K', '.C']

     DOCUMENT_MARKER = '.I'
@@ -22,7 +21,7 @@ class CACMParser:
         """
        Return the marker of the line if it exists, or None.
         """
-        return next((m for m in self.MARKERS.keys() if line.startswith(m)), None)
+        return next((m for m in self.ALL_MARKERS if line.startswith(m)), None)

     def parse(self, limit=None):
         documents = dict()
@@ -39,7 +38,7 @@ class CACMParser:
             if marker == self.DOCUMENT_MARKER:
                 # this is a new document
-                current_document_id = int(line[len(marker) + 1])
+                current_document_id = int(line[3:])

                 if limit is not None \
                     and current_document_id >= limit:
@@ -49,7 +48,7 @@ class CACMParser:
             elif marker is not None:
                 # new marker
-                if current_marker is not None:
+                if current_marker is not None and current_marker in self.MARKERS.keys():
                     setattr(
                         documents[current_document_id],
                         self.MARKERS[current_marker],
...
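For reference, a small invented record in the CACM-style layout this parser expects: every field starts with a marker from ALL_MARKERS, and fields whose marker is not in MARKERS (such as .B, .A, .X) are recognised but not copied onto the Document.

.I 1
.T
An Invented Title About Indexing
.W
An invented abstract, a few sentences long.
.K
invented, keywords, indexing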
@@ -7,17 +7,47 @@ class SimpleTokenizer(AbstractTokenizer):
     Simple tokenizer using any space or punctuation sign as separator
     """

-    SEPARATORS = [" ", ".", ",", "!", "?", ":", ";"]
+    SEPARATORS = [" ", ".", ",", "!", "?", ":", ";", "\n", "(", ")"]

-    @staticmethod
-    def get_tokens(document: 'Document'):
+    def __init__(self, stop_words_filename=None):
+        self.stop_words_filename = stop_words_filename
+        self._stop_words = set()
+
+        if stop_words_filename is not None:
+            self.load_stop_words()
+
+    def load_stop_words(self):
+        with open(self.stop_words_filename, 'r') as f:
+            self._stop_words = set([word.strip() for word in f.readlines()])
+
+    def get_tokens(self, document: 'gogole.document.Document'):
         tokens = [
             document.title,
+            document.abstract,
+            document.keywords,
         ]

-        for sep in self.SEPARATORS:
-            tokens = chain(*[t.sep(sep) for t in tokens])
+        #
+        # 1/ split by any separator
+        #
+        words = self._split_strings(tokens)
+
+        #
+        # 2/ use lower case
+        #
+        words = [x.lower() for x in words]
+
+        #
+        # 3/ filter words using a stop-word list
+        #
+        filtered_words = [word for word in words if word not in self._stop_words]
+
+        return filtered_words

-        tokens = [t for t in tokens if t != ""]
-        return tokens
+    def _split_strings(self, tokens):
+        for separator in self.SEPARATORS:
+            tokens = chain(*[t.split(separator) for t in tokens])
+
+        return [t for t in tokens if t != ""]
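A hedged, self-contained illustration of the new pipeline; the document content is invented, and with no stop-word file nothing is filtered out.

# Illustration only; Document and SimpleTokenizer are the classes from this commit.
from gogole.document import Document
from gogole.tokenizer.simple_tokenizer import SimpleTokenizer

doc = Document(1)
doc.title = "An Introduction to Information Retrieval Systems"

tokenizer = SimpleTokenizer()   # no stop_words_filename: empty stop-word set
print(tokenizer.get_tokens(doc))
# ['an', 'introduction', 'to', 'information', 'retrieval', 'systems']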
import math


def compute_parameters(token_size1, vocabulary_size1, token_size2, vocabulary_size2):
    b = math.log(vocabulary_size1/vocabulary_size2) / math.log(token_size1/token_size2)
    k = vocabulary_size1 / (token_size1 ** b)

    return (b, k)


def estimate_vocabulary_size(b, k, token_size):
    return round(k * (token_size ** b))
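Heaps' law models vocabulary growth as V ≈ k·T^b, so two (token count, vocabulary size) measurements are enough to solve for both parameters, which is what compute_parameters does. A quick sanity check with invented numbers (not CACM results):

# Illustration only; the counts below are made up.
from gogole.utils import heap_law

b, k = heap_law.compute_parameters(100_000, 10_000, 50_000, 6_500)
print(round(b, 2), round(k, 1))                        # ~0.62 and ~7.8
print(heap_law.estimate_vocabulary_size(b, k, 10**6))  # ~42,000 distinct terms expected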
 import argparse

-from gogole.parser.cacm_parser import CACMParser
+from gogole.commands.analyze import AnalyzeCommand

-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-f", "--file", help="input collection file")
-    args = parser.parse_args()
-
-    parser = CACMParser(args.file)
-    documents = parser.parse(limit=10)
-
-    print("\n\n".join(str(d) for d in documents.values()))
+
+def main():
+    argsparser = argparse.ArgumentParser(prog="gogole")
+
+    # top-level parser
+    # collection information
+    argsparser.add_argument("-f", "--file", help="input collection file")
+    argsparser.add_argument("--stop-words-file", help="stop words list filename")
+
+    subparsers = argsparser.add_subparsers()
+
+    argsparser_analyze = subparsers.add_parser(
+        'analyze',
+        description="run an analysis such as token counts or Heaps' law parameter estimation",
+        help="additional help for analyze",