Commit 15d4923a authored by Andreas Mueller, committed by GitHub

a whole lot of changes (#179)

* trying to fix plurals

remove bigram example as all examples now do bigrams

add test for tokenization, let ``words_`` be a dict

add colormaps, always try horizontal/vertical if other doesn't fit!

remove relative_scaling from simple.py

add matplotlib to dependencies

* fix setting of self.words_, special case for a single word

* add matplotlib to travis script

* set matplotlib backend in tests

* hack for old matplotlib
parent b648b958
......@@ -29,7 +29,7 @@ if [[ "$DISTRIB" == "conda" ]]; then
# Configure the conda environment and put it in the path using the
# provided versions
conda create -n testenv --yes python=$PYTHON_VERSION pip nose mock \
numpy=$NUMPY_VERSION
numpy=$NUMPY_VERSION matplotlib
source activate testenv
......@@ -37,7 +37,7 @@ elif [[ "$DISTRIB" == "ubuntu" ]]; then
# Use standard ubuntu packages in their default version
virtualenv --system-site-packages testvenv
source testvenv/bin/activate
pip install nose mock
pip install nose mock matplotlib
fi
pip install -r requirements.txt
......
#!/usr/bin/env python
"""
Using bigrams and from_frequencies
==================================
We use a custom tokenizer (implemented from scratch here; in practice it is
recommended to use nltk, spacy or scikit-learn instead) so that word pairs
(bigrams, 2-grams) can be included in the word cloud.
The ``generate_from_frequencies`` method allows building a word cloud from a
list or array of ``(word, frequency)`` tuples, where ``word`` can be any
string and ``frequency`` any int or float.
We are using the likelihood ratio score developed by Dunning to find "collocations",
which are phrases made up of two or more words (we only consider two here).
If the chance that a bigram is a collocation is high, we discount the appearances
of the single words -- otherwise they would always be at least as big as the bigram.
"""
import numpy as np
from PIL import Image
from os import path
import matplotlib.pyplot as plt
import random
from itertools import tee
from collections import defaultdict
import re
from math import log
from wordcloud import WordCloud, STOPWORDS
# Dunning's likelihood ratio with notation from
# http://nlp.stanford.edu/fsnlp/promo/colloc.pdf
def l(k, n, x):
return log(max(x, 1e-10)) * k + log(max(1 - x, 1e-10)) * (n - k)
def score(bigram, counts, n_words):
N = n_words
c12 = counts[bigram]
c1 = counts[bigram[0]]
c2 = counts[bigram[1]]
p = c2 / N
p1 = c12 / c1
p2 = (c2 - c12) / (N - c1)
score = l(c12, c1, p) + l(c2 - c12, N - c1, p) - l(c12, c1, p1) - l(c2 - c12, N - c1, p2)
return -2 * score
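# Rough, illustrative sanity check of the threshold used below (the counts
# here are invented, not taken from the movie script): with 1000 words in
# total, two words each seen 20 times and the pair seen together 15 times,
#
#     score(("death", "star"),
#           {("death", "star"): 15, "death": 20, "star": 20}, 1000)
#
# comes out at roughly 110, well above the cutoff of 30 used further down,
# so such a bigram would be kept as a collocation.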
def grey_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
return "hsl(0, 0%%, %d%%)" % random.randint(60, 100)
def pairwise(iterable):
# from itertools recipes
# s -> (s0, s1), (s1, s2), (s2, s3), ...
a, b = tee(iterable)
next(b, None)
return zip(a, b)
def unigrams_and_bigrams(text, stopwords=None):
stopwords = [s.lower() for s in stopwords] if stopwords is not None else []
words = re.findall(r"\w[\w']+", text)
# remove stopwords
words = [word for word in words if word.lower() not in stopwords]
# remove 's
words = [word[:-2] if word.lower().endswith("'s") else word for word in words]
# fix for movie-script upper case names
words = [word if not word.isupper() else word.title() for word in words]
n_words = len(words)
# make tuples of two words following each other
bigrams = list(pairwise(words))
counts_unigrams = defaultdict(int)
counts_bigrams = defaultdict(int)
for word in words:
counts_unigrams[word] += 1
for bigram in bigrams:
# count occurrences of each bigram (tuples get joined by a space further below)
counts_bigrams[bigram] += 1
counts_all = {}
counts_all.update(counts_unigrams)
counts_all.update(counts_bigrams)
# discount single-word counts for words that occur inside detected bigrams
for bigram in counts_bigrams.keys():
# collocation detection (30 is arbitrary):
if score(bigram, counts_all, n_words) > 30:
counts_unigrams[bigram[0]] -= counts_bigrams[bigram]
counts_unigrams[bigram[1]] -= counts_bigrams[bigram]
# add joined bigram into unigrams
counts_unigrams[' '.join(bigram)] = counts_bigrams[bigram]
return counts_unigrams
d = path.dirname(__file__)
# read the mask image
# taken from
# http://www.stencilry.org/stencils/movies/star%20wars/storm-trooper.gif
mask = np.array(Image.open(path.join(d, "stormtrooper_mask.png")))
# movie script of "a new hope"
# http://www.imsdb.com/scripts/Star-Wars-A-New-Hope.html
# May the lawyers deem this fair use.
text = open("a_new_hope.txt").read()
# preprocessing the text a little bit
text = text.replace("INT", "")
text = text.replace("EXT", "")
wc = WordCloud(max_words=1000, mask=mask, margin=10,
color_func=grey_color_func, random_state=3)
# generate_from_frequencies ignores "stopwords" so we have to remove them ourselves
wc.generate_from_frequencies(unigrams_and_bigrams(text, STOPWORDS).items())
plt.imshow(wc)
wc.to_file("a_new_hope_bigrams.png")
plt.axis("off")
plt.show()
......@@ -22,8 +22,8 @@ import matplotlib.pyplot as plt
plt.imshow(wordcloud)
plt.axis("off")
# take relative word frequencies into account, lower max_font_size
wordcloud = WordCloud(max_font_size=40, relative_scaling=.5).generate(text)
# lower max_font_size
wordcloud = WordCloud(max_font_size=40).generate(text)
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
......
......@@ -9,7 +9,7 @@ setup(
url='https://github.com/amueller/word_cloud',
description='A little word cloud generator',
license='MIT',
install_requires=['numpy', 'pillow'],
install_requires=['numpy', 'pillow', 'matplotlib'],
ext_modules=[Extension("wordcloud.query_integral_image",
["wordcloud/query_integral_image.c"])],
scripts=['wordcloud/wordcloud_cli.py'],
......
from wordcloud import WordCloud, get_single_color_func
import numpy as np
from random import Random
from nose.tools import assert_equal, assert_greater, assert_true, assert_raises
from nose.tools import (assert_equal, assert_greater, assert_true,
assert_raises, assert_in, assert_not_in)
from numpy.testing import assert_array_equal
from PIL import Image
from tempfile import NamedTemporaryFile
import matplotlib
matplotlib.use('Agg')
THIS = """The Zen of Python, by Tim Peters
......@@ -41,6 +45,20 @@ def test_collocations():
assert_greater(len(wc2.words_), len(wc.words_))
def test_plurals_numbers():
text = THIS + "\n" + "1 idea 2 ideas three ideas although many Ideas"
wc = WordCloud(stopwords=[]).generate(text)
# not capitalized usually
assert_not_in("Ideas", wc.words_)
# plural removed
assert_not_in("ideas", wc.words_)
# usually capitalized
assert_not_in("although", wc.words_)
assert_in("idea", wc.words_)
assert_in("Although", wc.words_)
assert_in("better than", wc.words_)
def test_default():
# test that default word cloud creation and conversions work
wc = WordCloud(max_words=50)
......@@ -101,7 +119,7 @@ def test_check_errors():
def test_recolor():
wc = WordCloud(max_words=50)
wc = WordCloud(max_words=50, colormap="jet")
wc.generate(THIS)
array_before = wc.to_array()
wc.recolor()
......@@ -189,11 +207,9 @@ def test_process_text():
def test_generate_from_frequencies():
# test that generate_from_frequencies() takes input argument of class
# 'dict_items'
# test that generate_from_frequencies() accepts a plain dict as input
wc = WordCloud(max_words=50)
words = wc.process_text(THIS)
items = words.items()
result = wc.generate_from_frequencies(items)
result = wc.generate_from_frequencies(words)
assert_true(isinstance(result, WordCloud))
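# A minimal sketch of the new dict-based call (the word counts below are
# invented for illustration; any mapping of word -> weight works):
#
#     wc = WordCloud(max_words=50)
#     wc.generate_from_frequencies({"python": 10, "zen": 5, "explicit": 3})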
......@@ -8,6 +8,8 @@ from wordcloud import wordcloud_cli as cli
from mock import patch
from nose.tools import assert_equal, assert_greater, assert_true, assert_in, assert_not_in
import matplotlib
matplotlib.use('Agg')
temp = NamedTemporaryFile()
ArgOption = namedtuple('ArgOption', ['cli_name', 'init_name', 'pass_value', 'fail_value'])
......
......@@ -87,9 +87,6 @@ def process_tokens(words):
# counting frequency of each capitalization
d = defaultdict(dict)
for word in words:
if word.isdigit():
continue
word_lower = word.lower()
# get dict of cases for word_lower
case_dict = d[word_lower]
......@@ -97,6 +94,7 @@ def process_tokens(words):
case_dict[word] = case_dict.get(word, 0) + 1
# merge plurals into the singular count (simple cases only)
merged_plurals = {}
for key in list(d.keys()):
if key.endswith('s'):
key_singular = key[:-1]
......@@ -107,6 +105,7 @@ def process_tokens(words):
singular = word[:-1]
dict_singular[singular] = (dict_singular.get(singular, 0)
+ count)
merged_plurals[key] = key_singular
del d[key]
fused_cases = {}
standard_cases = {}
......@@ -116,4 +115,7 @@ def process_tokens(words):
first = max(case_dict.items(), key=item1)[0]
fused_cases[first] = sum(case_dict.values())
standard_cases[word_lower] = first
# add plurals to fused cases:
for plural, singular in merged_plurals.items():
standard_cases[plural] = standard_cases[singular.lower()]
return fused_cases, standard_cases
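# Illustrative example of the merging above (input invented): for
# process_tokens(["idea", "ideas", "Ideas"]) the plural and the capitalized
# variant are folded into the most common spelling, giving roughly
#     fused_cases    == {"idea": 3}
#     standard_cases == {"idea": "idea", "ideas": "idea"}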
......@@ -5,6 +5,8 @@
#
# License: MIT
from __future__ import division
import warnings
from random import Random
import os
......@@ -83,6 +85,31 @@ def random_color_func(word=None, font_size=None, position=None,
return "hsl(%d, 80%%, 50%%)" % random_state.randint(0, 255)
class colormap_color_func(object):
"""Color func created from matplotlib colormap.
Parameters
----------
colormap : string or matplotlib colormap
Colormap to sample from
Example
-------
>>> WordCloud(color_func=colormap_color_func("magma"))
"""
def __init__(self, colormap):
import matplotlib.pyplot as plt
self.colormap = plt.cm.get_cmap(colormap)
def __call__(self, word, font_size, position, orientation,
random_state=None, **kwargs):
if random_state is None:
random_state = Random()
r, g, b, _ = 255 * np.array(self.colormap(random_state.uniform(0, 1)))
return "rgb({:.0f}, {:.0f}, {:.0f})".format(r, g, b)
def get_single_color_func(color):
"""Create a color function which returns a single hue and saturation with.
different values (HSV). Accepted values are color strings as usable by
......@@ -186,6 +213,12 @@ class WordCloud(object):
.. versionchanged: 2.0
Default is now 0.5.
color_func : callable, default=None
Callable with parameters word, font_size, position, orientation,
font_path, random_state that returns a PIL color for each word.
Overwrites "colormap".
See colormap for specifying a matplotlib colormap instead.
regexp : string or None (optional)
Regular expression to split the input text into tokens in process_text.
If None is specified, ``r"\w[\w']+"`` is used.
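# Sketch of such a color_func (added for illustration; the name
# "single_hue_color_func" is invented and not part of this commit):
#
#     def single_hue_color_func(word, font_size, position, orientation,
#                               font_path=None, random_state=None, **kwargs):
#         # larger words get darker shades; any PIL color string works
#         return "hsl(210, 60%%, %d%%)" % max(20, 80 - font_size // 2)
#
#     wc = WordCloud(color_func=single_hue_color_func)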
......@@ -193,11 +226,22 @@ class WordCloud(object):
collocations : bool, default=True
Whether to include collocations (bigrams) of two words.
.. versionadded: 2.0
colormap : string or matplotlib colormap, default="viridis"
Matplotlib colormap to randomly draw colors from for each word.
Ignored if "color_func" is specified.
.. versionadded: 2.0
Attributes
----------
``words_``: list of tuples (string, float)
``words_`` : dict of string to float
Word tokens with associated frequency.
.. versionchanged: 2.0
``words_`` is now a dictionary
``layout_`` : list of tuples (string, int, (int, int), int, color))
Encodes the fitted word cloud. Encodes for each word the string, font
size, position, orientation and color.
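# Sketch of the new ``words_`` attribute (input invented): the most frequent
# token maps to 1.0 and the rest are scaled relative to it, e.g.
#
#     WordCloud().generate("spam spam eggs").words_
#     == {'spam': 1.0, 'eggs': 0.5}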
......@@ -213,13 +257,23 @@ class WordCloud(object):
"""
def __init__(self, font_path=None, width=400, height=200, margin=2,
ranks_only=None, prefer_horizontal=0.9, mask=None, scale=1,
color_func=random_color_func, max_words=200, min_font_size=4,
ranks_only=None, prefer_horizontal=.9, mask=None, scale=1,
color_func=None, max_words=200, min_font_size=4,
stopwords=None, random_state=None, background_color='black',
max_font_size=None, font_step=1, mode="RGB",
relative_scaling=.5, regexp=None, collocations=True):
relative_scaling=.5, regexp=None, collocations=True,
colormap=None):
if font_path is None:
font_path = FONT_PATH
if color_func is None and colormap is None:
# we need a colormap; "viridis" only ships with matplotlib >= 1.5,
# so fall back to "hsv" on older versions (crude version-string check)
import matplotlib
version = matplotlib.__version__
if version[0] < "2" and version[2] < "5":
colormap = "hsv"
else:
colormap = "viridis"
self.colormap = colormap
self.collocations = collocations
self.font_path = font_path
self.width = width
......@@ -228,9 +282,9 @@ class WordCloud(object):
self.prefer_horizontal = prefer_horizontal
self.mask = mask
self.scale = scale
self.color_func = color_func
self.color_func = color_func or colormap_color_func(colormap)
self.max_words = max_words
self.stopwords = stopwords or STOPWORDS
self.stopwords = stopwords if stopwords is not None else STOPWORDS
self.min_font_size = min_font_size
self.font_step = font_step
self.regexp = regexp
......@@ -238,8 +292,6 @@ class WordCloud(object):
random_state = Random(random_state)
self.random_state = random_state
self.background_color = background_color
if max_font_size is None:
max_font_size = height
self.max_font_size = max_font_size
self.mode = mode
if relative_scaling < 0 or relative_scaling > 1:
......@@ -267,13 +319,16 @@ class WordCloud(object):
"""
return self.generate_from_frequencies(frequencies)
def generate_from_frequencies(self, frequencies):
def generate_from_frequencies(self, frequencies, max_font_size=None):
"""Create a word_cloud from words and frequencies.
Parameters
----------
frequencies : array of tuples
A tuple contains the word and its frequency.
frequencies : dict from string to float
A dict containing words and their associated frequencies.
max_font_size : int
Use this font size instead of self.max_font_size.
Returns
-------
......@@ -281,7 +336,7 @@ class WordCloud(object):
"""
# make sure frequencies are sorted and normalized
frequencies = sorted(frequencies, key=item1, reverse=True)
frequencies = sorted(frequencies.items(), key=item1, reverse=True)
frequencies = frequencies[:self.max_words]
# largest entry will be 1
max_frequency = float(frequencies[0][1])
......@@ -289,8 +344,6 @@ class WordCloud(object):
frequencies = [(word, freq / max_frequency)
for word, freq in frequencies]
self.words_ = frequencies
if self.random_state is not None:
random_state = self.random_state
else:
......@@ -326,9 +379,31 @@ class WordCloud(object):
img_array = np.asarray(img_grey)
font_sizes, positions, orientations, colors = [], [], [], []
font_size = self.max_font_size
last_freq = 1.
if max_font_size is None:
# if not provided use default font_size
max_font_size = self.max_font_size
if max_font_size is None:
# figure out a good font size by trying to draw with
# just the first two words
if len(frequencies) == 1:
# we only have one word. We make it big!
font_size = self.height
else:
self.generate_from_frequencies(dict(frequencies[:2]),
max_font_size=self.height)
# take the harmonic mean of the two resulting font sizes as the starting size
sizes = [x[1] for x in self.layout_]
font_size = 2 * sizes[0] * sizes[1] / (sizes[0] + sizes[1])
else:
font_size = max_font_size
# we set self.words_ here because we called generate_from_frequencies
# above... hurray for good design?
self.words_ = dict(frequencies)
# start drawing grey image
for word, freq in frequencies:
# select the font size
......@@ -336,14 +411,15 @@ class WordCloud(object):
if rs != 0:
font_size = int(round((rs * (freq / float(last_freq))
+ (1 - rs)) * font_size))
if random_state.random() < self.prefer_horizontal:
orientation = None
else:
orientation = Image.ROTATE_90
tried_other_orientation = False
while True:
# try to find a position
font = ImageFont.truetype(self.font_path, font_size)
# transpose font optionally
if random_state.random() < self.prefer_horizontal:
orientation = None
else:
orientation = Image.ROTATE_90
transposed_font = ImageFont.TransposedFont(
font, orientation=orientation)
# get size of resulting text
......@@ -352,10 +428,17 @@ class WordCloud(object):
result = occupancy.sample_position(box_size[1] + self.margin,
box_size[0] + self.margin,
random_state)
if result is not None or font_size == 0:
if result is not None or font_size < self.min_font_size:
# either we found a place or font-size went too small
break
# if we didn't find a place, make font smaller
font_size -= self.font_step
if tried_other_orientation is False:
# first try the other orientation before shrinking the font
orientation = (Image.ROTATE_90 if orientation is None else None)
tried_other_orientation = True
else:
font_size -= self.font_step
orientation = None
if font_size < self.min_font_size:
# we were unable to draw any more
......@@ -420,6 +503,8 @@ class WordCloud(object):
# remove 's
words = [word[:-2] if word.lower().endswith("'s") else word
for word in words]
# remove numbers
words = [word for word in words if not word.isdigit()]
if self.collocations:
word_counts = unigrams_and_bigrams(words)
......@@ -442,7 +527,7 @@ class WordCloud(object):
self
"""
words = self.process_text(text)
self.generate_from_frequencies(words.items())
self.generate_from_frequencies(words)
return self
def generate(self, text):
......@@ -486,7 +571,7 @@ class WordCloud(object):
draw.text(pos, word, fill=color, font=transposed_font)
return img
def recolor(self, random_state=None, color_func=None):
def recolor(self, random_state=None, color_func=None, colormap=None):
"""Recolor existing layout.
Applying a new coloring is much faster than generating the whole
......@@ -502,6 +587,10 @@ class WordCloud(object):
Function to generate new color from word count, font size, position
and orientation. If None, self.color_func is used.
colormap : string or matplotlib colormap, default=None
Use this colormap to generate new colors. Ignored if color_func
is specified. If None, self.color_func (or self.colormap) is used.
Returns
-------
self
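# Sketch of recoloring an existing layout with a different colormap
# (``text`` is assumed to hold some input string):
#
#     wc = WordCloud(colormap="viridis").generate(text)
#     plt.imshow(wc.recolor(colormap="plasma", random_state=42))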
......@@ -511,7 +600,10 @@ class WordCloud(object):
self._check_generated()
if color_func is None:
color_func = self.color_func
if colormap is None:
color_func = self.color_func
else:
color_func = colormap_color_func(colormap)
self.layout_ = [(word_freq, font_size, position, orientation,
color_func(word=word_freq[0], font_size=font_size,
position=position, orientation=orientation,
......