Commit 15d4923a authored by Andreas Mueller, committed by GitHub

a whole lot of changes (#179)

* trying to fix plurals

remove bigram example as all examples now do bigrams

add test for tokenization, let ``words_`` be a dict

add colormaps, always try horizontal/vertical if the other doesn't fit!

remove relative_scaling from simple.py

add matplotlib to dependencies

* fix setting of self.words_, special case for a single word

* add matplotlib to travis script

* set matplotlib backend in tests

* hack for old matplotlib
parent b648b958
@@ -29,7 +29,7 @@ if [[ "$DISTRIB" == "conda" ]]; then
     # Configure the conda environment and put it in the path using the
     # provided versions
     conda create -n testenv --yes python=$PYTHON_VERSION pip nose mock \
-        numpy=$NUMPY_VERSION
+        numpy=$NUMPY_VERSION matplotlib
     source activate testenv
@@ -37,7 +37,7 @@ elif [[ "$DISTRIB" == "ubuntu" ]]; then
     # Use standard ubuntu packages in their default version
     virtualenv --system-site-packages testvenv
     source testvenv/bin/activate
-    pip install nose mock
+    pip install nose mock matplotlib
 fi
pip install -r requirements.txt
#!/usr/bin/env python
"""
Using bigrams and from_frequencies
==================================
We are using a custom tokenizer (here implemented from scratch, it's recommended to
use nltk, spacy or scikit-learn instead), to allow the inclusion of word-pairs
(bigrams, 2-grams) into the word cloud.
The ``from_frequencies`` method allows generating wordclouds from a list or
array of ``(word, frequency)`` tuples, where ``word`` can be any string, and
``frequency`` can be any int or float.
We are using the likelihood ratio score developed by Dunning to find "collocations",
which are phrases made up of two or more words (we only consider two here).
If the chance that a bigram is a collocation is high, we discount the appearances
of the single words -- otherwise they would always be at least as big as the bigram.
"""
import numpy as np
from PIL import Image
from os import path
import matplotlib.pyplot as plt
import random
from itertools import tee
from collections import defaultdict
import re
from math import log
from wordcloud import WordCloud, STOPWORDS


# Dunning's likelihood ratio, with notation from
# http://nlp.stanford.edu/fsnlp/promo/colloc.pdf
def l(k, n, x):
    # binomial log-likelihood of k successes in n trials with success
    # probability x; max() guards against taking log(0)
    return log(max(x, 1e-10)) * k + log(max(1 - x, 1e-10)) * (n - k)


def score(bigram, counts, n_words):
    N = n_words
    c12 = counts[bigram]
    c1 = counts[bigram[0]]
    c2 = counts[bigram[1]]
    p = c2 / N
    p1 = c12 / c1
    p2 = (c2 - c12) / (N - c1)
    score = (l(c12, c1, p) + l(c2 - c12, N - c1, p)
             - l(c12, c1, p1) - l(c2 - c12, N - c1, p2))
    return -2 * score
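
# Rough sanity check of the scoring (the numbers below are illustrative, not
# taken from the movie script): with
#     counts = {'New': 15, 'York': 12, ('New', 'York'): 10}
# and n_words = 1000, score(('New', 'York'), counts, 1000) evaluates to
# roughly 80, well above the threshold of 30 used below, so 'New York' would
# be kept as a collocation.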


def grey_color_func(word, font_size, position, orientation,
                    random_state=None, **kwargs):
    return "hsl(0, 0%%, %d%%)" % random.randint(60, 100)


def pairwise(iterable):
    # from itertools recipes
    # s -> (s0, s1), (s1, s2), (s2, s3), ...
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)


def unigrams_and_bigrams(text, stopwords=None):
    stopwords = [s.lower() for s in stopwords] if stopwords is not None else []
    words = re.findall(r"\w[\w']+", text)
    # remove stopwords
    words = [word for word in words if word.lower() not in stopwords]
    # remove 's
    words = [word[:-2] if word.lower().endswith("'s") else word
             for word in words]
    # fix for movie-script upper-case names
    words = [word if not word.isupper() else word.title() for word in words]
    n_words = len(words)
    # make tuples of two words following each other
    bigrams = list(pairwise(words))
    counts_unigrams = defaultdict(int)
    counts_bigrams = defaultdict(int)
    for word in words:
        counts_unigrams[word] += 1
    for bigram in bigrams:
        counts_bigrams[bigram] += 1
    counts_all = {}
    counts_all.update(counts_unigrams)
    counts_all.update(counts_bigrams)
    # discount the single words that make up each detected collocation
    for bigram in counts_bigrams.keys():
        # collocation detection (the threshold of 30 is arbitrary):
        if score(bigram, counts_all, n_words) > 30:
            counts_unigrams[bigram[0]] -= counts_bigrams[bigram]
            counts_unigrams[bigram[1]] -= counts_bigrams[bigram]
            # add the bigram, with its two words joined by a space
            counts_unigrams[' '.join(bigram)] = counts_bigrams[bigram]
    return counts_unigrams
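
# Hypothetical usage (the key below is made up; which bigrams survive depends
# on the threshold above): unigrams_and_bigrams(text) returns a plain dict of
# counts in which a bigram that passes the collocation test appears as a
# single space-joined key, e.g. 'Death Star', with the counts of 'Death' and
# 'Star' discounted accordingly.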

d = path.dirname(__file__)

# read the mask image
# taken from
# http://www.stencilry.org/stencils/movies/star%20wars/storm-trooper.gif
mask = np.array(Image.open(path.join(d, "stormtrooper_mask.png")))

# movie script of "A New Hope"
# http://www.imsdb.com/scripts/Star-Wars-A-New-Hope.html
# May the lawyers deem this fair use.
text = open(path.join(d, "a_new_hope.txt")).read()

# preprocess the text a little bit
text = text.replace("INT", "")
text = text.replace("EXT", "")

wc = WordCloud(max_words=1000, mask=mask, margin=10,
               color_func=grey_color_func, random_state=3)
# generate_from_frequencies ignores the "stopwords" argument, so we remove
# stopwords ourselves inside unigrams_and_bigrams
wc.generate_from_frequencies(unigrams_and_bigrams(text, STOPWORDS).items())
wc.to_file("a_new_hope_bigrams.png")

plt.imshow(wc)
plt.axis("off")
plt.show()
@@ -22,8 +22,8 @@ import matplotlib.pyplot as plt
 plt.imshow(wordcloud)
 plt.axis("off")

-# take relative word frequencies into account, lower max_font_size
-wordcloud = WordCloud(max_font_size=40, relative_scaling=.5).generate(text)
+# lower max_font_size
+wordcloud = WordCloud(max_font_size=40).generate(text)
 plt.figure()
 plt.imshow(wordcloud)
 plt.axis("off")
@@ -9,7 +9,7 @@ setup(
     url='https://github.com/amueller/word_cloud',
     description='A little word cloud generator',
     license='MIT',
-    install_requires=['numpy', 'pillow'],
+    install_requires=['numpy', 'pillow', 'matplotlib'],
     ext_modules=[Extension("wordcloud.query_integral_image",
                            ["wordcloud/query_integral_image.c"])],
     scripts=['wordcloud/wordcloud_cli.py'],
 from wordcloud import WordCloud, get_single_color_func
 import numpy as np
 from random import Random
-from nose.tools import assert_equal, assert_greater, assert_true, assert_raises
+from nose.tools import (assert_equal, assert_greater, assert_true,
+                        assert_raises, assert_in, assert_not_in)
 from numpy.testing import assert_array_equal
 from PIL import Image
 from tempfile import NamedTemporaryFile
+import matplotlib
+matplotlib.use('Agg')
THIS = """The Zen of Python, by Tim Peters
@@ -41,6 +45,20 @@ def test_collocations():
     assert_greater(len(wc2.words_), len(wc.words_))


+def test_plurals_numbers():
+    text = THIS + "\n" + "1 idea 2 ideas three ideas although many Ideas"
+    wc = WordCloud(stopwords=[]).generate(text)
+    # not capitalized usually
+    assert_not_in("Ideas", wc.words_)
+    # plural removed
+    assert_not_in("ideas", wc.words_)
+    # usually capitalized
+    assert_not_in("although", wc.words_)
+    assert_in("idea", wc.words_)
+    assert_in("Although", wc.words_)
+    assert_in("better than", wc.words_)
+
+
 def test_default():
     # test that default word cloud creation and conversions work
     wc = WordCloud(max_words=50)
@@ -101,7 +119,7 @@ def test_check_errors():

 def test_recolor():
-    wc = WordCloud(max_words=50)
+    wc = WordCloud(max_words=50, colormap="jet")
     wc.generate(THIS)
     array_before = wc.to_array()
     wc.recolor()
@@ -189,11 +207,9 @@ def test_process_text():

 def test_generate_from_frequencies():
-    # test that generate_from_frequencies() takes input argument of class
-    # 'dict_items'
+    # test that generate_from_frequencies() takes dicts as input
     wc = WordCloud(max_words=50)
     words = wc.process_text(THIS)
-    items = words.items()
-    result = wc.generate_from_frequencies(items)
+    result = wc.generate_from_frequencies(words)
     assert_true(isinstance(result, WordCloud))
@@ -8,6 +8,8 @@ from wordcloud import wordcloud_cli as cli
 from mock import patch
 from nose.tools import assert_equal, assert_greater, assert_true, assert_in, assert_not_in
+import matplotlib
+matplotlib.use('Agg')

 temp = NamedTemporaryFile()
 ArgOption = namedtuple('ArgOption', ['cli_name', 'init_name', 'pass_value', 'fail_value'])
@@ -87,9 +87,6 @@ def process_tokens(words):
     # counting frequency of each capitalization
     d = defaultdict(dict)
     for word in words:
-        if word.isdigit():
-            continue
-
         word_lower = word.lower()
         # get dict of cases for word_lower
         case_dict = d[word_lower]
@@ -97,6 +94,7 @@ def process_tokens(words):
         case_dict[word] = case_dict.get(word, 0) + 1
     # merge plurals into the singular count (simple cases only)
+    merged_plurals = {}
     for key in list(d.keys()):
         if key.endswith('s'):
             key_singular = key[:-1]
@@ -107,6 +105,7 @@ def process_tokens(words):
                     singular = word[:-1]
                     dict_singular[singular] = (dict_singular.get(singular, 0)
                                                + count)
+                merged_plurals[key] = key_singular
                 del d[key]
     fused_cases = {}
     standard_cases = {}
@@ -116,4 +115,7 @@ def process_tokens(words):
         first = max(case_dict.items(), key=item1)[0]
         fused_cases[first] = sum(case_dict.values())
         standard_cases[word_lower] = first
+    # map merged plurals to the standard case of their singular:
+    for plural, singular in merged_plurals.items():
+        standard_cases[plural] = standard_cases[singular.lower()]
     return fused_cases, standard_cases
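
For illustration, a hedged sketch of what the reworked plural handling returns; the module path, the input list, and the exact mapping are assumptions for this sketch, not taken from the diff:

    from wordcloud.tokenization import process_tokens  # module path assumed
    fused, standard = process_tokens(["idea", "ideas", "Ideas", "idea"])
    # plural and capitalization variants fold into one fused count,
    # e.g. fused == {'idea': 4}; the new merged_plurals bookkeeping lets
    # standard_cases map the plural back to the singular's standard casing,
    # e.g. standard == {'idea': 'idea', 'ideas': 'idea'}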