Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
keeleliin
word_cloud_est
Commits
1c291578
Commit
1c291578
authored
Sep 22, 2014
by
Andreas Mueller
Browse files
refactor everything into class
parent
ff3ee0a3
Changes
3
Hide whitespace changes
Inline
Side-by-side
wordcloud/TODO
0 → 100644
View file @
1c291578
* command line interface
* easy access to image, numpy array
* html export
* good notebook interface
* recoloring support
* different color schemes by default
* examples
* unit tests
* website
* docstrings
* deterministic functionality
* long functions?
* no dependency on sklearn any more?
wordcloud/__init__.py
View file @
1c291578
# Author: Andreas Christian Mueller <amueller@ais.uni-bonn.de>
# (c) 2012
# Modified by: Paul Nechifor <paul@nechifor.net>
#
# License: MIT
from
.wordcloud
import
WordCloud
import
random
import
os
import
re
import
numpy
as
np
from
operator
import
itemgetter
from
PIL
import
Image
from
PIL
import
ImageDraw
from
PIL
import
ImageFont
from
query_integral_image
import
query_integral_image
# Key function that picks the second element (the count) of a (word, count) pair.
item1 = itemgetter(1)

# Default font; font paths are checked for existence in fit_words().
FONT_PATH = "/usr/share/fonts/truetype/droid/DroidSansMono.ttf"

# Default stop words, read from the "stopwords" file next to this module.
# The file is split on newlines and every entry is stripped of whitespace.
# Reading inside ``with`` closes the file handle as soon as the set is built.
with open(os.path.join(os.path.dirname(__file__),
                       'stopwords')) as _stopwords_file:
    STOPWORDS = set([x.strip() for x in _stopwords_file.read().split('\n')])
def fit_words(words, font_path=None, width=400, height=200,
              margin=5, ranks_only=False, prefer_horiz=0.90, mask=None):
    """Generate the positions for words.

    Parameters
    ----------
    words : array of tuples
        A tuple contains the word and its frequency.

    font_path : string
        Font path to the font that will be used (OTF or TTF).
        Defaults to DroidSansMono path, but you might not have it.

    width : int (default=400)
        Width of the canvas.

    height : int (default=200)
        Height of the canvas.

    margin : int (default=5)
        Padding added to the width and height of each word's bounding box
        when looking for a free spot; the word itself is drawn offset by
        ``margin // 2``.

    ranks_only : boolean (default=False)
        Only use the rank of the words, not the actual counts.

    prefer_horiz : float (default=0.90)
        The ratio of times to try horizontal fitting as opposed to vertical.

    mask : nd-array or None (default=None)
        If not None, its shape replaces ``width`` and ``height`` and its
        values are added to the occupancy image, so non-zero cells count as
        already taken.

    Returns
    -------
    layout : list of tuples
        One ``((word, frequency), font_size, (x, y), orientation)`` entry per
        placed word.  ``x`` is the row and ``y`` the column of the position.
        The list is shorter than ``words`` if the canvas filled up.

    Raises
    ------
    ValueError
        If ``font_path`` does not exist.

    Notes
    -----
    Larger canvases will make the code significantly slower. If you need a
    large word cloud, run this function with a lower canvas size, and draw it
    with a larger scale.

    Font sizes never grow from one word to the next, so ``words`` should be
    sorted by decreasing frequency.
    """
    if len(words) <= 0:
        print("We need at least 1 word to plot a word cloud, got %d."
              % len(words))

    if font_path is None:
        font_path = FONT_PATH

    if not os.path.exists(font_path):
        raise ValueError("The font %s does not exist." % font_path)

    if mask is not None:
        width = mask.shape[1]
        height = mask.shape[0]
        # the order of the cumsum's is important for speed ?!
        integral = np.cumsum(np.cumsum(mask, axis=1), axis=0).astype(np.uint32)
    else:
        integral = np.zeros((height, width), dtype=np.uint32)

    # grey-scale scratch image used only to track which pixels are occupied
    img_grey = Image.new("L", (width, height))
    draw = ImageDraw.Draw(img_grey)
    font_sizes, positions, orientations = [], [], []

    # initialize font size "large enough"
    font_size = height

    for word, count in words:
        # alternative way to set the font size
        if not ranks_only:
            font_size = min(font_size, int(100 * np.log(count + 100)))

        # Shrink the font until the word fits somewhere.  Stopping before the
        # size reaches zero avoids asking the font loader for a 0-point font.
        result = None
        while font_size > 0:
            font = ImageFont.truetype(font_path, font_size)
            # transpose font optionally
            if random.random() < prefer_horiz:
                orientation = None
            else:
                orientation = Image.ROTATE_90
            transposed_font = ImageFont.TransposedFont(font,
                                                       orientation=orientation)
            # Pass the font explicitly instead of ImageDraw.setfont(), which
            # newer Pillow releases no longer provide.
            box_size = draw.textsize(word, font=transposed_font)
            # find possible places using integral image:
            result = query_integral_image(integral, box_size[1] + margin,
                                          box_size[0] + margin)
            if result is not None:
                break
            # if we didn't find a place, make font smaller
            font_size -= 1

        if result is None:
            # we were unable to draw any more
            break

        x, y = np.array(result) + margin // 2
        # PIL expects (column, row), hence the swap
        draw.text((y, x), word, fill="white", font=transposed_font)
        positions.append((x, y))
        orientations.append(orientation)
        font_sizes.append(font_size)

        # recompute integral image; the mask is optional
        if mask is None:
            img_array = np.asarray(img_grey)
        else:
            img_array = np.asarray(img_grey) + mask
        # recompute bottom right
        # the order of the cumsum's is important for speed ?!
        partial_integral = np.cumsum(np.cumsum(img_array[x:, y:], axis=1),
                                     axis=0)
        # paste recomputed part into old image
        # if x or y is zero it is a bit annoying
        if x > 0:
            if y > 0:
                partial_integral += (integral[x - 1, y:]
                                     - integral[x - 1, y - 1])
            else:
                partial_integral += integral[x - 1, y:]
        if y > 0:
            partial_integral += integral[x:, y - 1][:, np.newaxis]

        integral[x:, y:] = partial_integral

    # materialize so the result can be iterated more than once on Python 3
    return list(zip(words, font_sizes, positions, orientations))
def random_color_func(word, font_size, position, orientation):
    """Return a random color as an ``hsl(...)`` string.

    The hue is drawn uniformly from 0..255 (inclusive) with the module-level
    ``random`` generator; saturation and lightness are fixed at 80% and 50%.
    All four arguments are accepted but ignored.
    """
    hue = random.randint(0, 255)
    return "hsl(%d, 80%%, 50%%)" % hue
def draw(elements, file_name, font_path=None, width=400, height=200, scale=1,
         color_func=random_color_func):
    """Render fitted words to an image file.

    Parameters
    ----------
    elements : iterable of tuples
        Entries of the form ``((word, count), font_size, position,
        orientation)``, as returned by ``fit_words``.

    file_name : string
        Where the image is saved; passed to ``Image.save``.

    font_path : string
        Font path to the font that will be used (OTF or TTF).
        Defaults to ``FONT_PATH``.

    width : int (default=400)
        Width of the canvas the words were fitted on.

    height : int (default=200)
        Height of the canvas the words were fitted on.

    scale : number (default=1)
        Factor applied to the canvas size, font sizes and positions.

    color_func : callable (default=random_color_func)
        Called as ``color_func(word, font_size, position, orientation)``;
        its result is used as the fill color of the word.
    """
    if font_path is None:
        font_path = FONT_PATH

    img = Image.new("RGB", (width * scale, height * scale))
    # named ``canvas`` so the local does not shadow this function's own name
    canvas = ImageDraw.Draw(img)
    for (word, count), font_size, position, orientation in elements:
        font = ImageFont.truetype(font_path, font_size * scale)
        transposed_font = ImageFont.TransposedFont(font,
                                                   orientation=orientation)
        color = color_func(word, font_size, position, orientation)
        # positions are stored as (row, column); PIL wants (column, row)
        pos = (position[1] * scale, position[0] * scale)
        # Pass the font explicitly instead of ImageDraw.setfont(), which
        # newer Pillow releases no longer provide.
        canvas.text(pos, word, fill=color, font=transposed_font)
    img.save(file_name)
def process_text(text, max_features=200, stopwords=None):
    """Split a long text into words, drop the stopwords and count the rest.

    Parameters
    ----------
    text : string
        The text to be processed.

    max_features : number (default=200)
        The maximum number of words.

    stopwords : set of strings
        The words that will be eliminated (compared in lower case).
        Defaults to ``STOPWORDS``.

    Returns
    -------
    words : list of tuples (string, float)
        The most frequent words in decreasing order of frequency.  Counts are
        divided by the largest count, so the first entry has frequency 1.0.
        Empty if the text contains no usable word.

    Notes
    -----
    There are better ways to do word tokenization, but I don't want to include
    all those things.
    """
    if stopwords is None:
        stopwords = STOPWORDS

    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3, so this
    # check works on both without naming ``unicode``.
    flags = re.UNICODE if isinstance(text, type(u"")) else 0

    # lower-cased word -> {spelling as seen in the text: occurrences}
    spellings_by_word = {}
    for word in re.findall(r"\w[\w']*", text, flags=flags):
        if word.isdigit():
            continue

        word_lower = word.lower()
        if word_lower in stopwords:
            continue

        spellings = spellings_by_word.setdefault(word_lower, {})
        spellings[word] = spellings.get(word, 0) + 1

    # Report each word under its most popular spelling with the summed count.
    totals = {}
    for spellings in spellings_by_word.values():
        first = max(spellings.items(), key=itemgetter(1))[0]
        totals[first] = sum(spellings.values())

    # merge plurals into the singular count (simple cases only)
    # Iterate over a snapshot of the keys because entries are deleted.
    for key in list(totals):
        if key.endswith('s'):
            key_singular = key[:-1]
            if key_singular in totals:
                totals[key_singular] += totals[key]
                del totals[key]

    if not totals:
        # nothing left to count; max() below would fail on an empty sequence
        return []

    words = sorted(totals.items(), key=itemgetter(1), reverse=True)
    words = words[:max_features]
    maximum = float(max(totals.values()))
    return [(word, count / maximum) for word, count in words]
# Names exported by a star-import of this package.
__all__ = ['WordCloud']
wordcloud/wordcloud.py
0 → 100644
View file @
1c291578
# Author: Andreas Christian Mueller <amueller@ais.uni-bonn.de>
# (c) 2012
# Modified by: Paul Nechifor <paul@nechifor.net>
#
# License: MIT
import
random
import
os
import
re
import
numpy
as
np
from
operator
import
itemgetter
from
PIL
import
Image
from
PIL
import
ImageDraw
from
PIL
import
ImageFont
from
query_integral_image
import
query_integral_image
# Key function that picks the second element (the count) of a (word, count) pair.
item1 = itemgetter(1)

# Default font used when WordCloud is created without ``font_path``.
FONT_PATH = "/usr/share/fonts/truetype/droid/DroidSansMono.ttf"

# Default stop words, read from the "stopwords" file next to this module.
# The file is split on newlines and every entry is stripped of whitespace.
# Reading inside ``with`` closes the file handle as soon as the set is built.
with open(os.path.join(os.path.dirname(__file__),
                       'stopwords')) as _stopwords_file:
    STOPWORDS = set([x.strip() for x in _stopwords_file.read().split('\n')])
def random_color_func(word, font_size, position, orientation,
                      random_state=None):
    """Return a random color as an ``hsl(...)`` string.

    The hue comes from ``random_state.randint(0, 255)``; when no
    ``random_state`` is given a fresh ``random.Random()`` is used.
    Saturation and lightness are fixed at 80% and 50%.  The first four
    arguments are accepted but ignored.
    """
    rng = random.Random() if random_state is None else random_state
    hue = rng.randint(0, 255)
    return "hsl(%d, 80%%, 50%%)" % hue
class WordCloud(object):
    """Word cloud object for generating and drawing.

    Parameters
    ----------
    font_path : string
        Font path to the font that will be used (OTF or TTF).
        Defaults to DroidSansMono path, but you might not have it.

    width : int (default=400)
        Width of the canvas.

    height : int (default=200)
        Height of the canvas.

    margin : int (default=5)
        Padding added to the width and height of each word's bounding box
        when looking for a free spot.

    rank_only : boolean (default=False)
        Only use the rank of the words, not the actual counts.

    prefer_horizontal : float (default=0.90)
        The ratio of times to try horizontal fitting as opposed to vertical.

    mask : nd-array or None (default=None)
        If not None, gives a binary mask on where to draw words. In this case,
        width and height will be ignored and the shape of mask will be used
        instead.

    scale : number (default=1)
        Factor applied to the canvas size, font sizes and positions when the
        layout is rendered by ``to_image``.

    color_func : callable (default=random_color_func)
        Called as ``color_func(word, font_size, position, orientation)`` while
        fitting; ``recolor`` passes ``random_state`` as a fifth argument.

    max_features : number (default=200)
        The maximum number of words.

    stopwords : set of strings
        The words that will be eliminated. Defaults to ``STOPWORDS``.

    Attributes
    ----------
    words_ : list of tuples (string, float)
        Word tokens with associated frequency.

    layout_ : list of tuples ((string, float), int, (int, int), int, color)
        Encodes the fitted word cloud. Encodes for each word the
        (string, frequency) pair, font size, position, orientation and color.
    """

    def __init__(self, font_path=None, width=400, height=200, margin=5,
                 rank_only=False, prefer_horizontal=0.9, mask=None, scale=1,
                 color_func=random_color_func, max_features=200,
                 stopwords=None):
        if stopwords is None:
            stopwords = STOPWORDS
        if font_path is None:
            font_path = FONT_PATH
        self.font_path = font_path
        self.width = width
        self.height = height
        self.margin = margin
        self.rank_only = rank_only
        self.prefer_horizontal = prefer_horizontal
        self.mask = mask
        self.scale = scale
        self.color_func = color_func
        self.max_features = max_features
        # stored under the name process_text() reads
        self.stopwords = stopwords

    def _canvas_size(self):
        """Return (width, height) of the canvas; a mask's shape wins."""
        if self.mask is not None:
            return self.mask.shape[1], self.mask.shape[0]
        return self.width, self.height

    def fit_words(self, words):
        """Generate the positions for words.

        Parameters
        ----------
        words : array of tuples
            A tuple contains the word and its frequency.

        Returns
        -------
        layout_ : list of tuples ((string, float), int, (int, int), int, color)
            Encodes the fitted word cloud. Encodes for each word the
            (string, frequency) pair, font size, position, orientation and
            color.  Positions are stored as (row, column).  The list is
            shorter than ``words`` if the canvas filled up.

        Notes
        -----
        Larger canvases will make the code significantly slower. If you need
        a large word cloud, run this function with a lower canvas size, and
        draw it with a larger scale.

        Font sizes never grow from one word to the next, so ``words`` should
        be sorted by decreasing frequency.
        """
        if len(words) <= 0:
            print("We need at least 1 word to plot a word cloud, got %d."
                  % len(words))

        width, height = self._canvas_size()
        if self.mask is not None:
            # the order of the cumsum's is important for speed ?!
            integral = np.cumsum(np.cumsum(self.mask, axis=1),
                                 axis=0).astype(np.uint32)
        else:
            integral = np.zeros((height, width), dtype=np.uint32)

        # grey-scale scratch image used only to track occupied pixels
        img_grey = Image.new("L", (width, height))
        draw = ImageDraw.Draw(img_grey)
        font_sizes, positions, orientations, colors = [], [], [], []

        # initialize font size "large enough"
        font_size = height

        for word, count in words:
            # alternative way to set the font size
            if not self.rank_only:
                font_size = min(font_size, int(100 * np.log(count + 100)))

            # Shrink the font until the word fits somewhere.  Stopping before
            # the size reaches zero avoids requesting a 0-point font.
            result = None
            while font_size > 0:
                font = ImageFont.truetype(self.font_path, font_size)
                # transpose font optionally
                if random.random() < self.prefer_horizontal:
                    orientation = None
                else:
                    orientation = Image.ROTATE_90
                transposed_font = ImageFont.TransposedFont(
                    font, orientation=orientation)
                # Pass the font explicitly instead of ImageDraw.setfont(),
                # which newer Pillow releases no longer provide.
                box_size = draw.textsize(word, font=transposed_font)
                # find possible places using integral image:
                result = query_integral_image(integral,
                                              box_size[1] + self.margin,
                                              box_size[0] + self.margin)
                if result is not None:
                    break
                # if we didn't find a place, make font smaller
                font_size -= 1

            if result is None:
                # we were unable to draw any more
                break

            x, y = np.array(result) + self.margin // 2
            # PIL expects (column, row), hence the swap
            draw.text((y, x), word, fill="white", font=transposed_font)
            positions.append((x, y))
            orientations.append(orientation)
            font_sizes.append(font_size)
            colors.append(self.color_func(word, font_size, (x, y),
                                          orientation))

            # recompute integral image; the mask is optional
            if self.mask is None:
                img_array = np.asarray(img_grey)
            else:
                img_array = np.asarray(img_grey) + self.mask
            # recompute bottom right
            # the order of the cumsum's is important for speed ?!
            partial_integral = np.cumsum(np.cumsum(img_array[x:, y:], axis=1),
                                         axis=0)
            # paste recomputed part into old image
            # if x or y is zero it is a bit annoying
            if x > 0:
                if y > 0:
                    partial_integral += (integral[x - 1, y:]
                                         - integral[x - 1, y - 1])
                else:
                    partial_integral += integral[x - 1, y:]
            if y > 0:
                partial_integral += integral[x:, y - 1][:, np.newaxis]

            integral[x:, y:] = partial_integral

        # A real list, so the layout survives repeated iteration by
        # to_image() and recolor() on Python 3.
        self.layout_ = list(zip(words, font_sizes, positions, orientations,
                                colors))
        return self.layout_

    def process_text(self, text):
        """Split a long text into words and eliminate the stopwords.

        Parameters
        ----------
        text : string
            The text to be processed.

        Returns
        -------
        words : list of tuples (string, float)
            Word tokens with associated frequency, most frequent first.
            Counts are divided by the largest count.  Empty if the text
            contains no usable word.  Also stored as ``self.words_``.

        Notes
        -----
        There are better ways to do word tokenization, but I don't want to
        include all those things.
        """
        # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3, so
        # this check works on both without naming ``unicode``.
        flags = re.UNICODE if isinstance(text, type(u"")) else 0

        # lower-cased word -> {spelling as seen in the text: occurrences}
        spellings_by_word = {}
        for word in re.findall(r"\w[\w']*", text, flags=flags):
            if word.isdigit():
                continue

            word_lower = word.lower()
            if word_lower in self.stopwords:
                continue

            spellings = spellings_by_word.setdefault(word_lower, {})
            spellings[word] = spellings.get(word, 0) + 1

        # Report each word under its most popular spelling.
        totals = {}
        for spellings in spellings_by_word.values():
            first = max(spellings.items(), key=item1)[0]
            totals[first] = sum(spellings.values())

        # merge plurals into the singular count (simple cases only)
        # Iterate over a snapshot of the keys because entries are deleted.
        for key in list(totals):
            if key.endswith('s'):
                key_singular = key[:-1]
                if key_singular in totals:
                    totals[key_singular] += totals[key]
                    del totals[key]

        if totals:
            words = sorted(totals.items(), key=item1, reverse=True)
            words = words[:self.max_features]
            maximum = float(max(totals.values()))
            words = [(word, count / maximum) for word, count in words]
        else:
            # nothing left to count; max() would fail on an empty sequence
            words = []

        self.words_ = words
        return words

    def generate(self, text):
        """Convenience function that calls process_text and fit_words.

        Returns
        -------
        self
        """
        self.process_text(text)
        self.fit_words(self.words_)
        return self

    def to_image(self):
        """Render the fitted layout to a new RGB PIL image.

        Returns
        -------
        img : PIL image
            Canvas scaled by ``self.scale``; its size follows the mask shape
            when a mask was given, otherwise ``width`` and ``height``.

        Raises
        ------
        ValueError
            If no layout has been computed yet.
        """
        if not hasattr(self, "layout_"):
            raise ValueError("WordCloud has not been calculated, call generate"
                             " first.")
        # use the same canvas size the layout was fitted on
        width, height = self._canvas_size()
        img = Image.new("RGB", (width * self.scale, height * self.scale))
        draw = ImageDraw.Draw(img)
        for (word, count), font_size, position, orientation, color in \
                self.layout_:
            font = ImageFont.truetype(self.font_path, font_size * self.scale)
            transposed_font = ImageFont.TransposedFont(
                font, orientation=orientation)
            # positions are stored as (row, column); PIL wants (column, row)
            pos = (position[1] * self.scale, position[0] * self.scale)
            draw.text(pos, word, fill=color, font=transposed_font)
        return img

    def recolor(self, random_state=None, color_func=None):
        """Recolor existing layout.

        Applying a new coloring is much faster than generating the whole
        wordcloud.

        Parameters
        ----------
        random_state : RandomState or None, default=None
            If not None, a fixed random state is used.  It is handed to
            ``color_func`` as its fifth argument.

        color_func : function or None, default=None
            Function to generate new color from word, font size, position
            and orientation.  If None, self.color_func is used.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If no layout has been computed yet.
        """
        if not hasattr(self, "layout_"):
            raise ValueError("WordCloud has not been calculated, call generate"
                             " first.")
        if color_func is None:
            color_func = self.color_func
        # Hand color_func the word string, as fit_words() does, while keeping
        # the (word, count) pair in the layout entry.
        self.layout_ = [((word, count), font_size, position, orientation,
                         color_func(word, font_size, position, orientation,
                                    random_state))
                        for (word, count), font_size, position, orientation, _
                        in self.layout_]
        return self

    def to_file(self, filename):
        """Export to image file.

        Parameters
        ----------
        filename : string
            Location to write to.

        Returns
        -------
        self
        """
        img = self.to_image()
        img.save(filename)
        return self

    def to_array(self):
        """Convert to numpy array.

        Returns
        -------
        image : nd-array size (height, width, 3)
            Word cloud image as numpy matrix.
        """
        # call to_image(); passing the bound method itself would wrap the
        # method object instead of the rendered image
        return np.array(self.to_image())

    def __asarray__(self):
        """Convert to numpy array.

        Returns
        -------
        image : nd-array size (height, width, 3)
            Word cloud image as numpy matrix.
        """
        # NOTE(review): numpy's conversion hook is named ``__array__``; this
        # name is kept as-is for existing callers -- confirm what was meant.
        return self.to_array()

    def to_html(self):
        raise NotImplementedError("FIXME!!!")
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment