Commit 4f6a914f authored by Prot Alexandre's avatar Prot Alexandre
parents 78703046 d6dc90db
......@@ -5,9 +5,10 @@ from gogole.parser.cacm_parser import CACMParser
from gogole.tokenizer.simple_tokenizer import SimpleTokenizer
class CACMCollection(Collection):
NAME = 'cacm'
def __init__(self):
# BSBI indexer with single block
self._indexer = BSBIIndexer('cacm', maxsize=None)
self._indexer = BSBIIndexer(self.NAME, maxsize=None)
self._parser = CACMParser()
......
......@@ -5,9 +5,11 @@ from gogole.parser.stanford_parser import StanfordParser
from gogole.tokenizer.no_tokenizer import NoTokenizer
class StanfordCollection(Collection):
NAME = 'stanford'
def __init__(self):
# BSBI indexer with single block
self._indexer = BSBIIndexer('stanford', maxsize=16*1024*1024)
self._indexer = BSBIIndexer(self.NAME, maxsize=16*1024*1024)
self._parser = StanfordParser()
......
from collections import Counter
import math
from gogole.tokenizer.simple_tokenizer import SimpleTokenizer
from gogole.parser.cacm_parser import CACMParser
from gogole.utils import heap_law
from gogole.utils import heap_law, plot_bar
COMMANDS = ['all', 'count_tokens', 'heap_law']
COMMANDS = ['all', 'count_tokens', 'heap_law', 'zipf_law']
def run(collection, args):
......@@ -54,3 +58,24 @@ def run(collection, args):
print("k: \t{0:.3g}".format(k))
print("\nestimation of vocabulary size for 1M tokens : {}".format(heap_law.estimate_vocabulary_size(b, k, 1000*1000)))
if 'zipf_law' in commands:
c = Counter(all_tokens)
heights = [rank for _, rank in c.most_common()]
x = list(range(1, len(heights)+1))
plot_bar(
x,
heights,
filename='graphs/{}_zipf_law.png'.format(collection.NAME),
xlabel="rank",
ylabel="frequency"
)
plot_bar(
list(map(math.log10, x)),
list(map(math.log10, heights)),
filename='graphs/{}_zipf_law_logs.png'.format(collection.NAME),
xlabel='log10(rank)',
ylabel='log10(frequency)'
)
from gogole.utils.timeit import timeit
from gogole.utils.plot import plot_bar
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
def plot_bar(x, heights, linewidth=2, filename='image.png', xlabel=None, ylabel=None):
    """Plot ``heights`` against ``x`` and save the result to ``filename``.

    Despite its name, this draws a line plot (``plt.plot``), not a bar chart.
    The drawing happens on pyplot's current figure, which is closed before
    returning, whether or not plotting and saving succeeded.

    Args:
        x: Horizontal coordinates of the points.
        heights: Vertical coordinates, one per entry of ``x``.
        linewidth: Width of the drawn line.
        filename: Destination of the image. When it is a string path, its
            parent directory is created if it does not exist yet.
        xlabel: Label for the horizontal axis; not set when ``None``.
        ylabel: Label for the vertical axis; not set when ``None``.
    """
    import os

    try:
        plt.plot(x, heights, linewidth=linewidth)

        # Skip unset labels rather than handing None to Matplotlib.
        if xlabel is not None:
            plt.xlabel(xlabel)
        if ylabel is not None:
            plt.ylabel(ylabel)

        # savefig does not create missing directories, so prepare the
        # parent folder (e.g. "graphs/") before writing the file.
        if isinstance(filename, str):
            target_dir = os.path.dirname(filename)
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)

        plt.savefig(filename)
    finally:
        # Always release the current figure so that a failed plot or save
        # cannot leak its curves into the next call.
        plt.close()
......@@ -25,9 +25,13 @@ b: 0.509
k: 31.7
estimation of vocabulary size for 1M tokens : 36034
```
Graphiques pour la loi de Zipf :
![zipf_law](/graphs/cacm_zipf_law.png)
![zipf_law_logs](/graphs/cacm_zipf_law_logs.png)
#### Collection CS276
Voici l'analyse obtenue pour la collection CS276 :
......@@ -35,20 +39,25 @@ Voici l'analyse obtenue pour la collection CS276
```
****************** Count tokens ******************
Total count of tokens : 17,879,253
Vocabulary size: 337,191
Total count of tokens : 25,498,340
Vocabulary size: 347,071
****** Count tokens for half the collection ******
Total count of tokens : 9,958,569
Vocabulary size: 191,499
Total count of tokens : 14,332,579
Vocabulary size: 196,989
******** Heap's law parameters estimation ********
b: 0.967
k: 0.0328
b: 0.983
k: 0.0181
estimation of vocabulary size for 1M tokens : 20755
estimation of vocabulary size for 1M tokens : 14374
```
Graphiques pour la loi de Zipf :
![zipf_law](/graphs/cs276_zipf_law.png)
![zipf_law_logs](/graphs/cs276_zipf_law_logs.png)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment