Commit 1c291578 authored by Andreas Mueller
Browse files

refactor everything into class

parent ff3ee0a3
* command line interface
* easy access to image, numpy array
* html export
* good notebook interface
* recoloring support
* different color schemes by default
* examples
* unit tests
* website
* docstrings
* deterministic functionality
* long functions?
* no dependency on sklearn any more?
# Author: Andreas Christian Mueller <amueller@ais.uni-bonn.de>
# (c) 2012
# Modified by: Paul Nechifor <paul@nechifor.net>
#
# License: MIT
from .wordcloud import WordCloud
import random
import os
import re
import numpy as np
from operator import itemgetter
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
from query_integral_image import query_integral_image
# Key function selecting the second element (the count) of a (word, count) pair.
item1 = itemgetter(1)
# Default font; this path only exists on systems that ship the Droid fonts there.
FONT_PATH = "/usr/share/fonts/truetype/droid/DroidSansMono.ttf"
# Stop words are read from the 'stopwords' file located next to this module,
# split on newlines and stripped.  The context manager closes the file handle
# deterministically instead of leaving it open until garbage collection.
with open(os.path.join(os.path.dirname(__file__),
                       'stopwords')) as _stopwords_file:
    STOPWORDS = set([x.strip() for x in _stopwords_file.read().split('\n')])
def fit_words(words, font_path=None, width=400, height=200,
              margin=5, ranks_only=False, prefer_horiz=0.90, mask=None):
    """Generate the positions for words.

    Parameters
    ----------
    words : array of tuples
        A tuple contains the word and its frequency.
    font_path : string
        Font path to the font that will be used (OTF or TTF).
        Defaults to DroidSansMono path, but you might not have it.
    width : int (default=400)
        Width of the canvas.
    height : int (default=200)
        Height of the canvas.
    margin : int (default=5)
        Extra pixels added to both sides of a word's box while searching for
        free space; the word is then drawn offset by ``margin // 2``.
    ranks_only : boolean (default=False)
        Only use the rank of the words, not the actual counts.
    prefer_horiz : float (default=0.90)
        The ratio of times to try horizontal fitting as opposed to vertical.
    mask : nd-array or None (default=None)
        If not None, ``width`` and ``height`` are ignored and the shape of
        the mask is used instead.  The mask is summed into the occupancy map
        that is searched for free space (presumably non-zero entries are
        treated as occupied -- confirm against ``query_integral_image``).

    Returns
    -------
    layout : list of tuples ((string, float), int, (int, int), orientation)
        For each placed word: the original ``words`` entry, the font size,
        the (row, column) position and the orientation (``None`` or
        ``Image.ROTATE_90``).  Words that could not be placed are left out.

    Raises
    ------
    ValueError
        If ``font_path`` does not exist.

    Notes
    -----
    Larger canvases will make the code significantly slower. If you need a
    large word cloud, run this function with a lower canvas size, and draw it
    with a larger scale.
    In the current form it actually just uses the rank of the counts, i.e. the
    relative differences don't matter. Play with setting the font_size in the
    main loop for different styles.
    """
    if len(words) <= 0:
        print("We need at least 1 word to plot a word cloud, got %d."
              % len(words))
    if font_path is None:
        font_path = FONT_PATH
    if not os.path.exists(font_path):
        raise ValueError("The font %s does not exist." % font_path)
    if mask is not None:
        width = mask.shape[1]
        height = mask.shape[0]
        # the order of the cumsum's is important for speed ?!
        integral = np.cumsum(np.cumsum(mask, axis=1),
                             axis=0).astype(np.uint32)
    else:
        integral = np.zeros((height, width), dtype=np.uint32)
    # create image
    img_grey = Image.new("L", (width, height))
    canvas = ImageDraw.Draw(img_grey)
    font_sizes, positions, orientations = [], [], []
    # initialize font size "large enough"
    font_size = height
    # start drawing grey image
    for word, count in words:
        # alternative way to set the font size
        if not ranks_only:
            font_size = min(font_size, int(100 * np.log(count + 100)))
        result = None
        # Shrink the font until the word fits.  The loop stops before a
        # size-0 font would be requested from the font loader.
        while font_size > 0:
            font = ImageFont.truetype(font_path, font_size)
            # transpose font optionally
            if random.random() < prefer_horiz:
                orientation = None
            else:
                orientation = Image.ROTATE_90
            transposed_font = ImageFont.TransposedFont(
                font, orientation=orientation)
            # get size of resulting text; the font is passed explicitly
            # instead of going through the deprecated ``setfont`` state
            box_size = canvas.textsize(word, font=transposed_font)
            # find possible places using integral image:
            result = query_integral_image(integral, box_size[1] + margin,
                                          box_size[0] + margin)
            if result is not None:
                break
            # if we didn't find a place, make font smaller
            font_size -= 1
        if result is None:
            # we were unable to draw any more
            break
        x, y = np.array(result) + margin // 2
        # actually draw the text
        canvas.text((y, x), word, fill="white", font=transposed_font)
        positions.append((x, y))
        orientations.append(orientation)
        font_sizes.append(font_size)
        # recompute integral image; the mask is optional, so only add it
        # when one was supplied
        img_array = np.asarray(img_grey)
        if mask is not None:
            img_array = img_array + mask
        # recompute bottom right
        # the order of the cumsum's is important for speed ?!
        partial_integral = np.cumsum(np.cumsum(img_array[x:, y:], axis=1),
                                     axis=0)
        # paste recomputed part into old image
        # if x or y is zero it is a bit annoying
        if x > 0:
            if y > 0:
                partial_integral += (integral[x - 1, y:]
                                     - integral[x - 1, y - 1])
            else:
                partial_integral += integral[x - 1, y:]
        if y > 0:
            partial_integral += integral[x:, y - 1][:, np.newaxis]
        integral[x:, y:] = partial_integral
    # Materialize the result so it can be iterated more than once even where
    # ``zip`` returns a one-shot iterator.
    return list(zip(words, font_sizes, positions, orientations))
def random_color_func(word, font_size, position, orientation):
    """Return a random color for a word.

    The four arguments are accepted so the function fits the ``color_func``
    call signature; none of them influences the result.

    Returns
    -------
    color : string
        An ``hsl(...)`` color string with 80% saturation, 50% lightness and a
        hue drawn from ``random.randint(0, 255)``.
    """
    hue = random.randint(0, 255)
    return "hsl(%d, 80%%, 50%%)" % hue
def draw(elements, file_name, font_path=None, width=400, height=200, scale=1,
         color_func=random_color_func):
    """Render a fitted layout to an image file.

    Parameters
    ----------
    elements : iterable of tuples ((word, count), font_size, position, orientation)
        The layout to draw, in the form returned by ``fit_words``.
    file_name : string
        Location the image is saved to.
    font_path : string
        Font path to the font that will be used (OTF or TTF).
        Defaults to ``FONT_PATH``.
    width : int (default=400)
        Width of the canvas the layout was fitted on.
    height : int (default=200)
        Height of the canvas the layout was fitted on.
    scale : number (default=1)
        Factor applied to the canvas size, font sizes and positions.
    color_func : callable
        Called as ``color_func(word, font_size, position, orientation)`` for
        every word; its return value is used as the fill color.
    """
    if font_path is None:
        font_path = FONT_PATH
    # Pixel sizes must be integers; the int() casts are no-ops for an integer
    # scale and make a fractional scale usable.
    img = Image.new("RGB", (int(width * scale), int(height * scale)))
    # Named ``canvas`` so the local does not shadow this function.
    canvas = ImageDraw.Draw(img)
    for (word, count), font_size, position, orientation in elements:
        font = ImageFont.truetype(font_path, int(font_size * scale))
        transposed_font = ImageFont.TransposedFont(font,
                                                   orientation=orientation)
        color = color_func(word, font_size, position, orientation)
        # positions are stored as (row, column); PIL expects (x, y)
        pos = (int(position[1] * scale), int(position[0] * scale))
        # the font is passed explicitly instead of via deprecated ``setfont``
        canvas.text(pos, word, fill=color, font=transposed_font)
    img.save(file_name)
def process_text(text, max_features=200, stopwords=None):
    """Split a long text into words, drop the stopwords and count the rest.

    Parameters
    ----------
    text : string
        The text to be processed.
    max_features : number (default=200)
        The maximum number of words.
    stopwords : set of strings
        The words that will be eliminated (compared in lowercase).
        Defaults to ``STOPWORDS``.

    Returns
    -------
    words : list of tuples (string, float)
        The ``max_features`` most frequent words, most frequent first, each
        paired with its count divided by the highest count.  Empty when the
        text contains no usable word.

    Notes
    -----
    Purely numeric tokens are skipped.  Spellings that differ only in case
    are counted together and reported under the most frequent spelling.  A
    word ending in 's' is merged into its singular when the singular also
    occurs (simple cases only).
    There are better ways to do word tokenization, but I don't want to include
    all those things.
    """
    if stopwords is None:
        stopwords = STOPWORDS
    # Unicode-aware matching for text strings, plain matching for byte
    # strings; works without referring to the Python 2-only ``unicode`` name.
    flags = 0 if isinstance(text, bytes) else re.UNICODE
    # lowercase word -> {exact spelling: count}
    spellings_by_word = {}
    for word in re.findall(r"\w[\w']*", text, flags=flags):
        if word.isdigit():
            continue
        word_lower = word.lower()
        if word_lower in stopwords:
            continue
        spellings = spellings_by_word.setdefault(word_lower, {})
        spellings[word] = spellings.get(word, 0) + 1

    # most popular spelling -> total count over all spellings
    totals = {}
    for spellings in spellings_by_word.values():
        most_popular = max(spellings.items(), key=itemgetter(1))[0]
        totals[most_popular] = sum(spellings.values())

    # merge plurals into the singular count (simple cases only); iterate over
    # a snapshot of the keys because entries are deleted along the way
    for key in list(totals):
        if key.endswith('s'):
            key_singular = key[:-1]
            if key_singular in totals:
                totals[key_singular] += totals[key]
                del totals[key]

    if not totals:
        # nothing left to count; avoids max() on an empty sequence
        return []
    ranked = sorted(totals.items(), key=itemgetter(1), reverse=True)
    ranked = ranked[:max_features]
    maximum = float(max(totals.values()))
    return [(word, count / maximum) for word, count in ranked]
# Names exported by a star-import of this module: only the WordCloud class.
__all__ = ['WordCloud']
# Author: Andreas Christian Mueller <amueller@ais.uni-bonn.de>
# (c) 2012
# Modified by: Paul Nechifor <paul@nechifor.net>
#
# License: MIT
import random
import os
import re
import numpy as np
from operator import itemgetter
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
from query_integral_image import query_integral_image
# Key function selecting the second element (the count) of a (word, count) pair.
item1 = itemgetter(1)
# Default font; this path only exists on systems that ship the Droid fonts there.
FONT_PATH = "/usr/share/fonts/truetype/droid/DroidSansMono.ttf"
# Stop words are read from the 'stopwords' file located next to this module,
# split on newlines and stripped.  The context manager closes the file handle
# deterministically instead of leaving it open until garbage collection.
with open(os.path.join(os.path.dirname(__file__),
                       'stopwords')) as _stopwords_file:
    STOPWORDS = set([x.strip() for x in _stopwords_file.read().split('\n')])
def random_color_func(word, font_size, position, orientation, random_state=None):
    """Return a random color for a word.

    The first four arguments are accepted so the function fits the
    ``color_func`` call signature; none of them influences the result.

    Parameters
    ----------
    random_state : object with a ``randint(low, high)`` method or None
        Source of randomness.  When None, a fresh ``random.Random()`` is
        created for this call.

    Returns
    -------
    color : string
        An ``hsl(...)`` color string with 80% saturation, 50% lightness and a
        hue obtained from ``randint(0, 255)``.
    """
    rng = random.Random() if random_state is None else random_state
    hue = rng.randint(0, 255)
    return "hsl(%d, 80%%, 50%%)" % hue
class WordCloud(object):
    """Word cloud object for generating and drawing.

    Parameters
    ----------
    font_path : string
        Font path to the font that will be used (OTF or TTF).
        Defaults to DroidSansMono path, but you might not have it.
    width : int (default=400)
        Width of the canvas.
    height : int (default=200)
        Height of the canvas.
    margin : int (default=5)
        Extra pixels added to both sides of a word's box while searching for
        free space; the word is then drawn offset by ``margin // 2``.
    rank_only : boolean (default=False)
        Only use the rank of the words, not the actual counts.
    prefer_horizontal : float (default=0.90)
        The ratio of times to try horizontal fitting as opposed to vertical.
    mask : nd-array or None (default=None)
        If not None, gives a binary mask on where to draw words. In this case,
        width and height will be ignored and the shape of mask will be used
        instead.
    scale : number (default=1)
        Factor applied to canvas size, font sizes and positions when the
        layout is rendered by ``to_image``.
    color_func : callable
        Called as ``color_func(word, font_size, position, orientation)`` while
        fitting, and with an additional ``random_state`` argument by
        ``recolor``; its return value is used as the fill color.
    max_features : number (default=200)
        The maximum number of words.
    stopwords : set of strings
        The words that will be eliminated. Defaults to ``STOPWORDS``.

    Attributes
    ----------
    words_ : list of tuples (string, float)
        Word tokens with associated frequency.
    layout_ : list of tuples ((string, float), int, (int, int), int, color)
        Encodes the fitted word cloud. Encodes for each word the
        (string, frequency) pair, font size, position, orientation and color.
    """

    def __init__(self, font_path=None, width=400, height=200, margin=5,
                 rank_only=False, prefer_horizontal=0.9, mask=None, scale=1,
                 color_func=random_color_func, max_features=200, stopwords=None):
        if stopwords is None:
            stopwords = STOPWORDS
        if font_path is None:
            font_path = FONT_PATH
        self.font_path = font_path
        self.width = width
        self.height = height
        self.margin = margin
        self.rank_only = rank_only
        self.prefer_horizontal = prefer_horizontal
        self.mask = mask
        self.scale = scale
        self.color_func = color_func
        self.max_features = max_features
        # stored under the name that process_text reads
        self.stopwords = stopwords

    def _canvas_size(self):
        """Return (width, height) of the layout canvas; a mask's shape wins."""
        if self.mask is not None:
            return self.mask.shape[1], self.mask.shape[0]
        return self.width, self.height

    def _check_generated(self):
        """Raise ValueError unless a layout has been computed."""
        if not hasattr(self, "layout_"):
            raise ValueError("WordCloud has not been calculated, call generate first.")

    def fit_words(self, words):
        """Generate the positions for words.

        Parameters
        ----------
        words : array of tuples
            A tuple contains the word and its frequency.

        Returns
        -------
        layout_ : list of tuples ((string, float), int, (int, int), int, color)
            Encodes the fitted word cloud. Encodes for each word the
            (string, frequency) pair, font size, position, orientation and
            color.  Words that could not be placed are left out.

        Notes
        -----
        Larger canvases will make the code significantly slower. If you need
        a large word cloud, run this function with a lower canvas size, and
        draw it with a larger scale.
        In the current form it actually just uses the rank of the counts, i.e.
        the relative differences don't matter. Play with setting the font_size
        in the main loop for different styles.
        """
        if len(words) <= 0:
            print("We need at least 1 word to plot a word cloud, got %d."
                  % len(words))
        width, height = self._canvas_size()
        if self.mask is not None:
            # the order of the cumsum's is important for speed ?!
            integral = np.cumsum(np.cumsum(self.mask, axis=1),
                                 axis=0).astype(np.uint32)
        else:
            integral = np.zeros((height, width), dtype=np.uint32)
        # create image
        img_grey = Image.new("L", (width, height))
        canvas = ImageDraw.Draw(img_grey)
        font_sizes, positions, orientations, colors = [], [], [], []
        # initialize font size "large enough"
        font_size = height
        # start drawing grey image
        for word, count in words:
            # alternative way to set the font size
            if not self.rank_only:
                font_size = min(font_size, int(100 * np.log(count + 100)))
            result = None
            # Shrink the font until the word fits.  The loop stops before a
            # size-0 font would be requested from the font loader.
            while font_size > 0:
                font = ImageFont.truetype(self.font_path, font_size)
                # transpose font optionally
                if random.random() < self.prefer_horizontal:
                    orientation = None
                else:
                    orientation = Image.ROTATE_90
                transposed_font = ImageFont.TransposedFont(
                    font, orientation=orientation)
                # get size of resulting text; the font is passed explicitly
                # instead of going through the deprecated ``setfont`` state
                box_size = canvas.textsize(word, font=transposed_font)
                # find possible places using integral image:
                result = query_integral_image(integral,
                                              box_size[1] + self.margin,
                                              box_size[0] + self.margin)
                if result is not None:
                    break
                # if we didn't find a place, make font smaller
                font_size -= 1
            if result is None:
                # we were unable to draw any more
                break
            x, y = np.array(result) + self.margin // 2
            # actually draw the text
            canvas.text((y, x), word, fill="white", font=transposed_font)
            positions.append((x, y))
            orientations.append(orientation)
            font_sizes.append(font_size)
            colors.append(self.color_func(word, font_size, (x, y), orientation))
            # recompute integral image; the mask is optional, so only add it
            # when one was supplied
            img_array = np.asarray(img_grey)
            if self.mask is not None:
                img_array = img_array + self.mask
            # recompute bottom right
            # the order of the cumsum's is important for speed ?!
            partial_integral = np.cumsum(np.cumsum(img_array[x:, y:], axis=1),
                                         axis=0)
            # paste recomputed part into old image
            # if x or y is zero it is a bit annoying
            if x > 0:
                if y > 0:
                    partial_integral += (integral[x - 1, y:]
                                         - integral[x - 1, y - 1])
                else:
                    partial_integral += integral[x - 1, y:]
            if y > 0:
                partial_integral += integral[x:, y - 1][:, np.newaxis]
            integral[x:, y:] = partial_integral
        # Materialize the layout: it is iterated by to_image and recolor,
        # possibly several times, which a one-shot zip iterator cannot serve.
        self.layout_ = list(zip(words, font_sizes, positions, orientations,
                                colors))
        return self.layout_

    def process_text(self, text):
        """Splits a long text into words, eliminates the stopwords.

        Parameters
        ----------
        text : string
            The text to be processed.

        Returns
        -------
        words : list of tuples (string, float)
            Word tokens with associated frequency: the ``max_features`` most
            frequent words, most frequent first, each paired with its count
            divided by the highest count.  Empty when the text contains no
            usable word.  The same list is stored as ``words_``.

        Notes
        -----
        There are better ways to do word tokenization, but I don't want to
        include all those things.
        """
        # Unicode-aware matching for text strings, plain matching for byte
        # strings; works without the Python 2-only ``unicode`` name.
        flags = 0 if isinstance(text, bytes) else re.UNICODE
        # lowercase word -> {exact spelling: count}
        spellings_by_word = {}
        for word in re.findall(r"\w[\w']*", text, flags=flags):
            if word.isdigit():
                continue
            word_lower = word.lower()
            if word_lower in self.stopwords:
                continue
            spellings = spellings_by_word.setdefault(word_lower, {})
            spellings[word] = spellings.get(word, 0) + 1

        # most popular spelling -> total count over all spellings
        totals = {}
        for spellings in spellings_by_word.values():
            most_popular = max(spellings.items(), key=item1)[0]
            totals[most_popular] = sum(spellings.values())

        # merge plurals into the singular count (simple cases only); iterate
        # over a snapshot of the keys because entries are deleted on the way
        for key in list(totals):
            if key.endswith('s'):
                key_singular = key[:-1]
                if key_singular in totals:
                    totals[key_singular] += totals[key]
                    del totals[key]

        if totals:
            ranked = sorted(totals.items(), key=item1, reverse=True)
            ranked = ranked[:self.max_features]
            maximum = float(max(totals.values()))
            words = [(word, count / maximum) for word, count in ranked]
        else:
            # nothing left to count; avoids max() on an empty sequence
            words = []
        self.words_ = words
        return words

    def generate(self, text):
        """Convenience function that calls process_text and fit_words.

        Parameters
        ----------
        text : string
            The text to build the word cloud from.

        Returns
        -------
        self
        """
        self.process_text(text)
        self.fit_words(self.words_)
        return self

    def to_image(self):
        """Render the fitted layout.

        Returns
        -------
        img : PIL image
            RGB image of the canvas size (mask shape if a mask is set)
            multiplied by ``scale``.

        Raises
        ------
        ValueError
            If no layout has been computed yet.
        """
        self._check_generated()
        # The layout was fitted on the mask's shape when a mask is set, so
        # the rendered canvas must use the same size.
        width, height = self._canvas_size()
        # Pixel sizes must be integers; the int() casts are no-ops for an
        # integer scale and make a fractional scale usable.
        img = Image.new("RGB", (int(width * self.scale),
                                int(height * self.scale)))
        canvas = ImageDraw.Draw(img)
        for (word, count), font_size, position, orientation, color in self.layout_:
            font = ImageFont.truetype(self.font_path,
                                      int(font_size * self.scale))
            transposed_font = ImageFont.TransposedFont(font,
                                                       orientation=orientation)
            # positions are stored as (row, column); PIL expects (x, y)
            pos = (int(position[1] * self.scale),
                   int(position[0] * self.scale))
            canvas.text(pos, word, fill=color, font=transposed_font)
        return img

    def recolor(self, random_state=None, color_func=None):
        """Recolor existing layout.

        Applying a new coloring is much faster than generating the whole
        wordcloud.

        Parameters
        ----------
        random_state : random.Random-like object or None, default=None
            Passed through to ``color_func`` as its fifth argument.
        color_func : function or None, default=None
            Function to generate new color from word, font size, position,
            orientation and random state. If None, self.color_func is used.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If no layout has been computed yet.
        """
        self._check_generated()
        if color_func is None:
            color_func = self.color_func
        # The layout keeps the (word, frequency) pair, but the color function
        # receives the word string, exactly as it does in fit_words.
        self.layout_ = [(word_freq, font_size, position, orientation,
                         color_func(word_freq[0], font_size, position,
                                    orientation, random_state))
                        for word_freq, font_size, position, orientation, _
                        in self.layout_]
        return self

    def to_file(self, filename):
        """Export to image file.

        Parameters
        ----------
        filename : string
            Location to write to.

        Returns
        -------
        self
        """
        img = self.to_image()
        img.save(filename)
        return self

    def to_array(self):
        """Convert to numpy array.

        Returns
        -------
        image : nd-array size (height, width, 3)
            Word cloud image as numpy matrix.
        """
        # call to_image; passing the bound method itself would wrap the
        # method object instead of the rendered image
        return np.array(self.to_image())

    def __asarray__(self):
        """Convert to numpy array.

        NOTE(review): NumPy's conversion hook is named ``__array__``; this
        method name is kept as-is for backward compatibility.

        Returns
        -------
        image : nd-array size (height, width, 3)
            Word cloud image as numpy matrix.
        """
        return self.to_array()

    def to_html(self):
        """HTML export; not implemented yet."""
        raise NotImplementedError("FIXME!!!")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment