Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
keeleliin
word_cloud_est
Commits
2577c429
Commit
2577c429
authored
Sep 22, 2014
by
Andreas Mueller
Browse files
Make things deterministic, add tests for masks.
parent
f73deb4a
Changes
4
Hide whitespace changes
Inline
Side-by-side
test/test_wordcloud.py
View file @
2577c429
from
wordcloud
import
WordCloud
import
numpy
as
np
from
nose.tools
import
assert_equal
from
nose.tools
import
assert_equal
,
assert_greater
,
assert_true
,
assert_raises
from
numpy.testing
import
assert_array_equal
from
PIL
import
Image
from
tempfile
import
NamedTemporaryFile
THIS
=
"""The Zen of Python, by Tim Peters
...
...
@@ -28,6 +31,7 @@ Namespaces are one honking great idea -- let's do more of those!
def
test_default
():
# test that default word cloud creation and conversions work
wc
=
WordCloud
(
max_words
=
50
)
wc
.
generate
(
THIS
)
...
...
@@ -49,13 +53,79 @@ def test_default():
assert_equal
(
wc_array
.
shape
,
(
wc
.
height
,
wc
.
width
,
3
))
def check_errors():
    """Placeholder: no error checks are implemented here."""
def test_writing_to_file():
    """Check that a cloud written with ``to_file`` reloads at the expected size."""
    wc = WordCloud()
    wc.generate(THIS)

    # check writing to file; the temporary file is used as a context manager
    # so it is closed even when an assertion below fails
    with NamedTemporaryFile(suffix=".png") as f:
        filename = f.name
        wc.to_file(filename)
        loaded_image = Image.open(filename)
        assert_equal(loaded_image.size, (wc.width, wc.height))
def test_check_errors():
    """Check the errors raised by a WordCloud on which generate was not called."""
    cloud = WordCloud()

    assert_raises(NotImplementedError, cloud.to_html)

    # Converting to an array must fail with a hint to call generate.
    try:
        np.array(cloud)
    except ValueError as err:
        assert_true("call generate" in str(err))
    else:
        raise AssertionError("np.array(wc) didn't raise")

    # Recoloring must fail with the same hint.
    try:
        cloud.recolor()
    except ValueError as err:
        assert_true("call generate" in str(err))
    else:
        raise AssertionError("wc.recolor didn't raise")
def test_recolor():
    """Check that recoloring keeps the filled positions but changes the colors."""
    wc = WordCloud(max_words=50)
    wc.generate(THIS)
    array_before = wc.to_array()
    wc.recolor()
    array_after = wc.to_array()
    # check that the same places are filled
    assert_array_equal(array_before.sum(axis=-1) != 0,
                       array_after.sum(axis=-1) != 0)
    # check that they are not the same; cast to a signed integer type first
    # so the subtraction cannot wrap around if to_array() yields unsigned
    # image data (e.g. uint8)
    difference = array_before.astype(int) - array_after.astype(int)
    assert_greater(np.abs(difference).sum(), 10000)
def test_random_state():
    """Check that two clouds built with the same random_state compare equal."""
    first, second = WordCloud(random_state=0), WordCloud(random_state=0)
    first.generate(THIS)
    second.generate(THIS)
    assert_array_equal(first, second)
def test_mask():
    """Check mask handling with an all-zero mask and a partly non-zero mask."""
    seed = 42

    # An all-zero mask must give the same result as using no mask at all.
    unmasked = WordCloud(random_state=seed)
    unmasked.generate(THIS)
    empty_mask = np.zeros(np.array(unmasked).shape[:2])
    masked = WordCloud(mask=empty_mask, random_state=seed)
    masked.generate(THIS)
    assert_array_equal(unmasked, masked)

    # A mask containing a non-zero rectangle.
    blocked = np.zeros((234, 456))
    blocked[100:150, 300:400] = 1
    cloud = WordCloud(mask=blocked)
    cloud.generate(THIS)
    rendered = np.array(cloud)

    # The rendered array takes its first two dimensions from the mask.
    assert_equal(blocked.shape, rendered.shape[:2])
    # Pixels under non-zero mask entries stay zero ...
    assert_array_equal(rendered[blocked != 0], 0)
    # ... while the zero-mask region has content.
    assert_greater(rendered[blocked == 0].sum(), 10000)
def check_parameters():
    """Placeholder for checking that parameters are actually used."""
wordcloud/TODO
View file @
2577c429
* command line interface
* easy access to image, numpy array
* html export
* good notebook interface
* recoloring support
* by default different color schemes
* examples
* unit tests
* website
* docstrings
* deterministic functionality
* long functions?
* no dependency on sklearn any more?
* redo examples
* examples
* filter one-letter words
wordcloud/query_integral_image.pyx
View file @
2577c429
...
...
@@ -4,7 +4,7 @@ import array
import
numpy
as
np
def
query_integral_image
(
unsigned
int
[:,:]
integral_image
,
int
size_x
,
int
size_y
):
def
query_integral_image
(
unsigned
int
[:,:]
integral_image
,
int
size_x
,
int
size_y
,
random_state
):
cdef
int
x
=
integral_image
.
shape
[
0
]
cdef
int
y
=
integral_image
.
shape
[
1
]
cdef
int
area
,
i
,
j
...
...
@@ -21,7 +21,7 @@ def query_integral_image(unsigned int[:,:] integral_image, int size_x, int size_
# no room left
return
None
# pick a location at random
cdef
int
goal
=
np
.
random
.
randint
(
hits
)
cdef
int
goal
=
random
_state
.
randint
(
0
,
hits
)
hits
=
0
for
i
in
xrange
(
x
-
size_x
):
for
j
in
xrange
(
y
-
size_y
):
...
...
wordcloud/wordcloud.py
View file @
2577c429
...
...
@@ -4,7 +4,7 @@
#
# License: MIT
import
r
andom
from
random
import
R
andom
import
os
import
re
import
numpy
as
np
...
...
@@ -24,7 +24,7 @@ STOPWORDS = set([x.strip() for x in open(os.path.join(os.path.dirname(__file__),
def random_color_func(word, font_size, position, orientation, random_state=None):
    """Return a random HSL color string.

    Parameters
    ----------
    word, font_size, position, orientation
        Accepted but not used by this function.

    random_state : object with a ``randint(a, b)`` method, or None (default=None)
        Source of randomness. If None, a fresh ``Random()`` is created.

    Returns
    -------
    str
        ``"hsl(H, 80%, 50%)"`` with H drawn from ``randint(0, 255)``.
    """
    if random_state is None:
        random_state = Random()
    return "hsl(%d, 80%%, 50%%)" % random_state.randint(0, 255)
...
...
@@ -50,9 +50,10 @@ class WordCloud(object):
The ratio of times to try horizontal fitting as opposed to vertical.
mask : nd-array or None (default=None)
If not None, gives a binary mask on where to draw words. In this case,
width and height will be ignored and the shape of mask will be used
instead.
If not None, gives a binary mask on where to draw words. All zero
entries will be considered "free" to draw on, while all non-zero
entries will be deemed occupied. If mask is not None, width and height will be
ignored and the shape of mask will be used instead.
max_words : number (default=200)
The maximum number of words.
...
...
@@ -72,7 +73,7 @@ class WordCloud(object):
def
__init__
(
self
,
font_path
=
None
,
width
=
400
,
height
=
200
,
margin
=
5
,
ranks_only
=
False
,
prefer_horizontal
=
0.9
,
mask
=
None
,
scale
=
1
,
color_func
=
random_color_func
,
max_words
=
200
,
stopwords
=
None
):
color_func
=
random_color_func
,
max_words
=
200
,
stopwords
=
None
,
random_state
=
None
):
if
stopwords
is
None
:
stopwords
=
STOPWORDS
if
font_path
is
None
:
...
...
@@ -88,8 +89,11 @@ class WordCloud(object):
self
.
color_func
=
color_func
self
.
max_words
=
max_words
self
.
stopwords
=
stopwords
if
isinstance
(
random_state
,
int
):
random_state
=
Random
(
random_state
)
self
.
random_state
=
random_state
def
fit_words
(
self
,
words
):
def
_
fit_words
(
self
,
words
):
"""Generate the positions for words.
Parameters
...
...
@@ -113,6 +117,10 @@ class WordCloud(object):
relative differences don't matter. Play with setting the font_size in the
main loop for different styles.
"""
if
self
.
random_state
is
not
None
:
random_state
=
self
.
random_state
else
:
random_state
=
Random
()
if
len
(
words
)
<=
0
:
print
(
"We need at least 1 word to plot a word cloud, got %d."
...
...
@@ -145,7 +153,7 @@ class WordCloud(object):
# try to find a position
font
=
ImageFont
.
truetype
(
self
.
font_path
,
font_size
)
# transpose font optionally
if
random
.
random
()
<
self
.
prefer_horizontal
:
if
random
_state
.
random
()
<
self
.
prefer_horizontal
:
orientation
=
None
else
:
orientation
=
Image
.
ROTATE_90
...
...
@@ -156,7 +164,7 @@ class WordCloud(object):
box_size
=
draw
.
textsize
(
word
)
# find possible places using integral image:
result
=
query_integral_image
(
integral
,
box_size
[
1
]
+
self
.
margin
,
box_size
[
0
]
+
self
.
margin
)
box_size
[
0
]
+
self
.
margin
,
random_state
)
if
result
is
not
None
or
font_size
==
0
:
break
# if we didn't find a place, make font smaller
...
...
@@ -172,7 +180,8 @@ class WordCloud(object):
positions
.
append
((
x
,
y
))
orientations
.
append
(
orientation
)
font_sizes
.
append
(
font_size
)
colors
.
append
(
self
.
color_func
(
word
,
font_size
,
(
x
,
y
),
orientation
))
colors
.
append
(
self
.
color_func
(
word
,
font_size
,
(
x
,
y
),
orientation
,
random_state
=
random_state
))
# recompute integral image
if
self
.
mask
is
None
:
img_array
=
np
.
asarray
(
img_grey
)
...
...
@@ -198,7 +207,7 @@ class WordCloud(object):
self
.
layout_
=
zip
(
words
,
font_sizes
,
positions
,
orientations
,
colors
)
return
self
.
layout_
def
process_text
(
self
,
text
):
def
_
process_text
(
self
,
text
):
"""Splits a long text into words, eliminates the stopwords.
Parameters
...
...
@@ -265,20 +274,32 @@ class WordCloud(object):
return
words
def generate(self, text):
    """Generate wordcloud from text.

    Calls _process_text and _fit_words.

    Parameters
    ----------
    text : passed unchanged to ``_process_text``.

    Returns
    -------
    self
    """
    self._process_text(text)
    # _fit_words is given the words_ attribute read from self.
    self._fit_words(self.words_)
    return self
def
to_image
(
self
):
def _check_generated(self):
    """Raise ValueError unless the ``layout_`` attribute exists on self."""
    if hasattr(self, "layout_"):
        return
    raise ValueError("WordCloud has not been calculated, call generate first.")
img
=
Image
.
new
(
"RGB"
,
(
self
.
width
*
self
.
scale
,
self
.
height
*
self
.
scale
))
def
to_image
(
self
):
self
.
_check_generated
()
if
self
.
mask
is
not
None
:
width
=
self
.
mask
.
shape
[
1
]
height
=
self
.
mask
.
shape
[
0
]
else
:
height
,
width
=
self
.
height
,
self
.
width
img
=
Image
.
new
(
"RGB"
,
(
width
*
self
.
scale
,
height
*
self
.
scale
))
draw
=
ImageDraw
.
Draw
(
img
)
for
(
word
,
count
),
font_size
,
position
,
orientation
,
color
in
self
.
layout_
:
font
=
ImageFont
.
truetype
(
self
.
font_path
,
font_size
*
self
.
scale
)
...
...
@@ -296,8 +317,9 @@ class WordCloud(object):
Parameters
----------
random_state : RandomState or None, default=None
If not None, a fixed random state is used.
random_state : RandomState, int, or None, default=None
If not None, a fixed random state is used. If an int is given, this
is used as seed for a random.Random state.
color_func : function or None, default=None
Function to generate new color from word count, font size, position
...
...
@@ -307,6 +329,7 @@ class WordCloud(object):
-------
self
"""
self
.
_check_generated
()
if
color_func
is
None
:
color_func
=
self
.
color_func
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment