Commit 02692309 authored by Paul Nechifor's avatar Paul Nechifor
Browse files

Cleaned it up a little bit.

parent 3d6b3b34
build
wordcloud/query_integral_image.c
wordcloud/query_integral_image.so
/wordcloud/query_integral_image.c
/wordcloud/query_integral_image.so
*.pyc
*~
*.png
!/examples/constitution.png
!/examples/alice.png
Copyright (c) 2012 Andreas Christian Mueller
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
word_cloud
==========
A fork of [Andreas Mueller](https://github.com/amueller)'s
[word_cloud](https://github.com/amueller/word_cloud) to make it a little more
extendable and installable as a package.
A little word cloud generator in Python. Read more about it on the [blog
post][blog-post].
Install it by running:
pip install Cython
pip install numpy
pip install PIL
## Installation
Get this package:
wget https://github.com/paul-nechifor/word_cloud/archive/master.zip
unzip master.zip
rm master.zip
cd word_cloud-master
sudo python setup.py install
cd ..
sudo rm -r word_cloud-master master.zip
Run the files in `examples/` for a short intro.
Install it:
sudo pip install -r requirements.txt
sudo python setup.py install
## Examples
Note that if you are not on Ubuntu, you need to adjust FONT_PATH to point to
some existing font.
Check out [examples/simple.py][simple] for a short intro. A sample output is:
![Constitution](examples/constitution.png)
Or run [examples/more.py][more] to see more options.
## Used in
### Reddit Cloud
[Reddit Cloud][reddit-cloud] is a Reddit bot which generates word clouds for
comments in submissions and user histories. You can see it being operated on
[/u/WordCloudBot2][wc2] ([top posting][wc2top]).
![A Reddit Cloud sample](http://i.imgur.com/tcbZnKW.png)
### <other>
*Send a pull request to add yours here.*
## Issues
Using Pillow instead of PIL might get you the [`TypeError: 'int' object is
not iterable` problem][intprob] also showcased on the blog.
[blog-post]: http://peekaboo-vision.blogspot.de/2012/11/a-wordcloud-in-python.html
[simple]: examples/simple.py
[more]: examples/more.py
[reddit-cloud]: https://github.com/paul-nechifor/reddit-cloud
[wc2]: http://www.reddit.com/user/WordCloudBot2
[wc2top]: http://www.reddit.com/user/WordCloudBot2/?sort=top
[intprob]: http://peekaboo-vision.blogspot.de/2012/11/a-wordcloud-in-python.html#bc_0_28B
#!/usr/bin/env python2
"""Minimal word_cloud example: build a word cloud PNG from alice.txt.

Reads the text file next to this script, extracts (word, frequency)
pairs, lays the words out on a 500x500 canvas, and renders the result
at double scale to alice.png.
"""

from os import path
import sys
import os

import wordcloud

# Resolve files relative to this script, not the current working directory.
d = path.dirname(__file__)

# Read the whole text.
text = open(path.join(d, 'alice.txt')).read()

# Separate into a list of (word, frequency).
words = wordcloud.process_text(text, max_features=2000)

# Compute the position of the words.
elements = wordcloud.fit_words(words, width=500, height=500)

# Draw the positioned words to a PNG file.
wordcloud.draw(elements, path.join(d, 'alice.png'), width=500, height=500,
    scale=2)
#!/usr/bin/env python2
"""Word_cloud example with default options: constitution.txt -> constitution.png.

Same flow as simple.py but relies on the library's default canvas size,
word limit, and scale.
"""

from os import path
import sys
import os

import wordcloud

# Resolve files relative to this script, not the current working directory.
d = path.dirname(__file__)

# Read the whole text.
text = open(path.join(d, 'constitution.txt')).read()

# Separate into a list of (word, frequency).
words = wordcloud.process_text(text)

# Compute the position of the words.
elements = wordcloud.fit_words(words)

# Draw the positioned words to a PNG file.
wordcloud.draw(elements, path.join(d, 'constitution.png'))
Cython>=0.19.1
PIL>=1.1.7
numpy>=1.7.1
import os

from distutils.core import setup
from Cython.Build import cythonize

# Package metadata and build configuration. The Cython extension
# (query_integral_image) is compiled at install time; the bundled
# 'stopwords' data file is shipped inside the package.
setup(
    name='wordcloud',
    version='1.0.0',
    url='https://github.com/paul-nechifor/word_cloud',
    license='MIT',
    ext_modules=cythonize('wordcloud/query_integral_image.pyx'),
    packages=['wordcloud'],
    package_data={'wordcloud': ['stopwords']}
)
# Author: Andreas Christian Mueller <amueller@ais.uni-bonn.de>
# (c) 2012
# Author: Paul Nechifor <paul@nechifor.net>
# Modified by: Paul Nechifor <paul@nechifor.net>
#
# License: MIT
......@@ -19,29 +19,24 @@ FONT_PATH = "/usr/share/fonts/truetype/droid/DroidSansMono.ttf"
STOPWORDS = set([x.strip() for x in open(os.path.join(os.path.dirname(__file__),
'stopwords')).read().split('\n')])
def fit_words(words, counts, font_path=None, width=400, height=200,
def fit_words(words, font_path=None, width=400, height=200,
margin=5, ranks_only=False, prefer_horiz=0.90):
"""Build word cloud using word counts.
"""Generate the positions for words.
Parameters
----------
words : numpy array of strings
Words that will be drawn in the image.
counts : numpy array of word counts
Word counts or weighting of words. Determines the size of the word in
the final image.
Will be normalized to lie between zero and one.
words : array of tuples
A tuple contains the word and its frequency.
font_path : string
Font path to the font that will be used.
Defaults to DroidSansMono path.
Font path to the font that will be used (OTF or TTF).
Defaults to DroidSansMono path, but you might not have it.
width : int (default=400)
Width of the word cloud image.
Width of the canvas.
height : int (default=200)
Height of the word cloud image.
Height of the canvas.
ranks_only : boolean (default=False)
Only use the rank of the words, not the actual counts.
......@@ -51,33 +46,24 @@ def fit_words(words, counts, font_path=None, width=400, height=200,
Notes
-----
Larger Images with make the code significantly slower.
If you need a large image, you can try running the algorithm at a lower
resolution and then drawing the result at the desired resolution.
In the current form it actually just uses the rank of the counts,
i.e. the relative differences don't matter.
Play with setting the font_size in the main loop for different styles.
Colors are used completely at random. Currently the colors are sampled
from HSV space with a fixed S and V.
Adjusting the percentages at the very end gives differnt color ranges.
Obviously you can also set all at random - haven't tried that.
Larger canvases will make the code significantly slower. If you need a large
word cloud, run this function with a lower canvas size, and draw it with a
larger scale.
In the current form it actually just uses the rank of the counts, i.e. the
relative differences don't matter. Play with setting the font_size in the
main loop for different styles.
"""
if len(counts) <= 0:
if len(words) <= 0:
print("We need at least 1 word to plot a word cloud, got %d."
% len(counts))
% len(words))
if font_path is None:
font_path = FONT_PATH
# normalize counts
#counts = counts / float(max(counts))
# sort words by counts
#inds = np.argsort(counts)[::-1]
#counts = counts[inds]
#words = words[inds]
if not os.path.exists(font_path):
raise ValueError("The font %s does not exist." % font_path)
# create image
img_grey = Image.new("L", (width, height))
......@@ -85,10 +71,12 @@ def fit_words(words, counts, font_path=None, width=400, height=200,
integral = np.zeros((height, width), dtype=np.uint32)
img_array = np.asarray(img_grey)
font_sizes, positions, orientations = [], [], []
# intitiallize font size "large enough"
font_size = 1000
font_size = height
# start drawing grey image
for word, count in zip(words, counts):
for word, count in words:
# alternative way to set the font size
if not ranks_only:
font_size = min(font_size, int(100 * np.log(count + 100)))
......@@ -155,7 +143,7 @@ def draw(elements, file_name, font_path=None, width=400, height=200, scale=1,
img = Image.new("RGB", (width * scale, height * scale))
draw = ImageDraw.Draw(img)
for word, font_size, position, orientation in elements:
for (word, count), font_size, position, orientation in elements:
font = ImageFont.truetype(font_path, font_size * scale)
transposed_font = ImageFont.TransposedFont(font,
orientation=orientation)
......@@ -165,20 +153,37 @@ def draw(elements, file_name, font_path=None, width=400, height=200, scale=1,
draw.text(pos, word, fill=color)
img.save(file_name)
def process_text(text, max_features=200, stopwords=STOPWORDS):
def process_text(text, max_features=200, stopwords=None):
"""Splits a long text into words, eliminates the stopwords and returns
(words, counts) which is necessary for make_wordcloud().
Parameters
----------
text : string
The text to be processed.
max_features : number (default=200)
The maximum number of words.
stopwords : set of strings
The words that will be eliminated.
Notes
-----
There are better ways to do word tokenization, but I don't want to include
all those things.
"""
# there are better ways to do this, but I don't want to include all those
# things
d = {}
if stopwords is None:
stopwords = STOPWORDS
d = {}
for word in re.findall(r"\w[\w']*", text):
word_lower = word.lower()
if word_lower in stopwords:
continue
# Look in all lowercase dict.
# Look in lowercase dict.
if d.has_key(word_lower):
d2 = d[word_lower]
else:
......@@ -192,21 +197,15 @@ def process_text(text, max_features=200, stopwords=STOPWORDS):
d2[word] = 1
d3 = {}
for dv in d.values():
for d2 in d.values():
# Get the most popular case.
first = sorted(dv.iteritems(), key=lambda x: x[1], reverse=True)[0][0]
d3[first] = sum(dv.values())
first = sorted(d2.iteritems(), key=lambda x: x[1], reverse=True)[0][0]
d3[first] = sum(d2.values())
sd = sorted(d3.iteritems(), key=lambda x: x[1], reverse=True)
sd = sd[:max_features]
words = sorted(d3.iteritems(), key=lambda x: x[1], reverse=True)
words = words[:max_features]
maximum = float(max(d3.values()))
for i, (word, count) in enumerate(words):
words[i] = word, count/maximum
words = []
counts = []
for word, count in sd:
words.append(word)
counts.append(count / maximum)
return words, counts
return words
......@@ -43,6 +43,7 @@ few
for
from
further
get
had
hadn't
has
......@@ -105,6 +106,7 @@ ourselves
out
over
own
r
same
shan't
she
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment