Commit bc8e76ef authored by Andreas Mueller's avatar Andreas Mueller

slight cleanup, seems to work well.

parent a0c8b23a
......@@ -10,8 +10,15 @@ The ``from_frequencies`` method allows generating wordclouds from a list or
array of ``(word, frequency)`` tuples, where ``word`` can be any string, and
``frequency`` can be any int or float.
We are using the likelihood ratio score developed by Dunning to find "collocations",
which are phrases made up of two or more words (we only consider two here).
If the chance that a bigram is a collocation is high, we discount the appearances
of the single words -- otherwise they would always be at least as big as the bigram.
"""
import numpy as np
from PIL import Image
from os import path
......@@ -20,10 +27,31 @@ import random
from itertools import tee
from collections import defaultdict
import re
from math import log
from wordcloud import WordCloud, STOPWORDS
# dunning's likelihood ratio with notation from
# http://nlp.stanford.edu/fsnlp/promo/colloc.pdf
def l(k, n, x):
    # Binomial log-likelihood L(k; n, x) = x^k * (1 - x)^(n - k), in log space.
    # Both probabilities are clamped at 1e-10 so log() never receives zero.
    p_success = max(x, 1e-10)
    p_failure = max(1 - x, 1e-10)
    return k * log(p_success) + (n - k) * log(p_failure)
def score(bigram, counts, n_words):
    # Dunning's log-likelihood ratio for a bigram (w1, w2): compares the
    # hypothesis "P(w2) is independent of w1" against "P(w2) depends on
    # whether w1 precedes it". Larger return values mean stronger evidence
    # that the pair is a collocation. Notation follows the reference cited
    # above (N, c1, c2, c12).
    total = n_words
    count_pair = counts[bigram]
    count_first = counts[bigram[0]]
    count_second = counts[bigram[1]]
    # Maximum-likelihood estimates: P(w2) under independence, and the two
    # conditional probabilities P(w2 | w1) and P(w2 | not w1) otherwise.
    p_indep = count_second / total
    p_after_first = count_pair / count_first
    p_elsewhere = (count_second - count_pair) / (total - count_first)
    log_ratio = (l(count_pair, count_first, p_indep)
                 + l(count_second - count_pair, total - count_first, p_indep)
                 - l(count_pair, count_first, p_after_first)
                 - l(count_second - count_pair, total - count_first, p_elsewhere))
    return -2 * log_ratio
def grey_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    # Color callback in the wordcloud color_func signature; every argument
    # except the module-level RNG draw is ignored. Returns a random grey
    # (hue 0, saturation 0) with lightness between 60% and 100%.
    lightness = random.randint(60, 100)
    return "hsl(0, 0%%, %d%%)" % lightness
......@@ -35,6 +63,7 @@ def pairwise(iterable):
next(b, None)
return zip(a, b)
def unigrams_and_bigrams(text, stopwords=None):
    """Count unigrams and bigrams in *text*, fold strong collocations into
    single "w1 w2" entries, and return the resulting {token: count} dict.

    NOTE(review): this span is a commit-diff view with a hidden hunk in the
    middle (the unigram counting / bigram construction setup is not visible
    here), and adjacent duplicate statements below are the removed/added
    line pairs from the diff — do not treat them as runnable as-is.
    """
    # Stopword matching is case-insensitive; default to no stopwords.
    stopwords = [s.lower() for s in stopwords] if stopwords is not None else []
    # Tokens: a word character followed by word chars or apostrophes
    # (so every token is at least two characters long).
    words = re.findall(r"\w[\w']+", text)
......@@ -54,19 +83,19 @@ def unigrams_and_bigrams(text, stopwords=None):
    # Tally each bigram tuple; presumably counts_bigrams is a defaultdict(int)
    # built in the hidden hunk — TODO confirm.
    for bigram in bigrams:
        # join tuples by a space
        counts_bigrams[bigram] += 1
    # Merge both tables so score() can look up a single word or a pair
    # through one dict.
    counts_all = {}
    counts_all.update(counts_unigrams)
    counts_all.update(counts_bigrams)
    counts_all.update(counts_bigrams)
    # decount words inside bigrams
    # NOTE(review): the next two lines are the old (buggy: counts_bigrams is
    # a dict, not callable) and new versions of the same loop header.
    for bigram in counts_bigrams().keys():
    for bigram in counts_bigrams.keys():
        # collocation detection (30 is arbitrary):
        if score(bigram, counts_all, n_words) > 30:
            # Subtract the pair's occurrences from each member word so the
            # single words are not double-counted next to the joined bigram.
            counts_unigrams[bigram[0]] -= counts_bigrams[bigram]
            counts_unigrams[bigram[1]] -= counts_bigrams[bigram]
            # add joined bigram into unigrams
            # NOTE(review): old line has a typo (`counts_bigram`, a NameError);
            # the commit's corrected version follows it.
            counts_unigrams[' '.join(bigram)] = counts_bigram[bigram]
            counts_unigrams[' '.join(bigram)] = counts_bigrams[bigram]
    return counts_unigrams
......@@ -91,6 +120,6 @@ wc = WordCloud(max_words=1000, mask=mask, margin=10,
# from_frequencies ignores "stopwords" so we have to do it ourselves
# NOTE(review): `wc`, `text`, and `plt` are defined in an earlier hunk that
# is not visible in this view.
wc.generate_from_frequencies(unigrams_and_bigrams(text, STOPWORDS).items())
plt.imshow(wc)
# NOTE(review): the next two lines are the removed/added pair from the
# commit diff — the output filename was renamed to *_bigrams.png.
wc.to_file("a_new_hope.png")
wc.to_file("a_new_hope_bigrams.png")
plt.axis("off")
plt.show()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment