Commit d8b1ebce authored by Dos Santos David

plot zipf law

parent c0f8adf6
from collections import Counter
import math
from gogole.tokenizer.simple_tokenizer import SimpleTokenizer from gogole.tokenizer.simple_tokenizer import SimpleTokenizer
from gogole.parser.cacm_parser import CACMParser from gogole.parser.cacm_parser import CACMParser
from gogole.utils import heap_law from gogole.utils import heap_law, plot_bar
COMMANDS = ['all', 'count_tokens', 'heap_law'] COMMANDS = ['all', 'count_tokens', 'heap_law', 'zipf_law']
def run(collection, args): def run(collection, args):
...@@ -54,3 +58,24 @@ def run(collection, args): ...@@ -54,3 +58,24 @@ def run(collection, args):
print("k: \t{0:.3g}".format(k)) print("k: \t{0:.3g}".format(k))
print("\nestimation of vocabulary size for 1M tokens : {}".format(heap_law.estimate_vocabulary_size(b, k, 1000*1000))) print("\nestimation of vocabulary size for 1M tokens : {}".format(heap_law.estimate_vocabulary_size(b, k, 1000*1000)))
if 'zipf_law' in commands:
c = Counter(all_tokens)
heights = [rank for _, rank in c.most_common()]
x = list(range(1, len(heights)+1))
plot_bar(
x,
heights,
filename='zipf_law.png',
xlabel="rank",
ylabel="frequency"
)
plot_bar(
list(map(math.log10, x)),
list(map(math.log10, heights)),
filename='zipf_law_logs.png',
xlabel='log10(rank)',
ylabel='log10(frequency)'
)
from gogole.utils.timeit import timeit from gogole.utils.timeit import timeit
from gogole.utils.plot import plot_bar
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
def plot_bar(x, heights, linewidth=2, filename='image.png', xlabel=None, ylabel=None):
    """Plot ``heights`` against ``x`` and save the figure to ``filename``.

    Despite its name, this helper draws a line plot (``plt.plot``), not a
    bar chart.

    Args:
        x: values for the horizontal axis.
        heights: values for the vertical axis, one per entry of ``x``.
        linewidth: width of the plotted line, forwarded to ``plt.plot``.
        filename: path handed to ``plt.savefig``.
        xlabel: label of the horizontal axis, forwarded to ``plt.xlabel``.
        ylabel: label of the vertical axis, forwarded to ``plt.ylabel``.
    """
    try:
        plt.plot(x, heights, linewidth=linewidth)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.savefig(filename)
    finally:
        # Always close the current figure, even if plotting or saving fails;
        # otherwise the next call would draw on top of this stale figure.
        plt.close()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment