Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
W
word_cloud_est
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Incidents
Environments
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
keeleliin
word_cloud_est
Commits
bc8e76ef
Commit
bc8e76ef
authored
Jun 29, 2016
by
Andreas Mueller
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
slight cleanup, seems to work well.
parent
a0c8b23a
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
35 additions
and
6 deletions
+35
-6
examples/bigrams.py
examples/bigrams.py
+35
-6
No files found.
examples/bigrams.py
View file @
bc8e76ef
...
...
@@ -10,8 +10,15 @@ The ``from_frequencies`` method allows generating wordclouds from a list or
array of ``(word, frequency)`` tuples, where ``word`` can be any string, and
``frequency`` can be any int or float.
We are using the likelihood ratio score developed by Dunning to find "collocations",
which are phrases made up of two or more words (we only consider two here).
If the chance that a bigram is a collocation is high, we discount the appearances
of the single words -- otherwise they would always be at least as big as the bigram.
"""
import
numpy
as
np
from
PIL
import
Image
from
os
import
path
...
...
@@ -20,10 +27,31 @@ import random
from
itertools
import
tee
from
collections
import
defaultdict
import
re
from
math
import
log
from
wordcloud
import
WordCloud
,
STOPWORDS
# dunning's likelihood ratio with notation from
# http://nlp.stanford.edu/fsnlp/promo/colloc.pdf
def l(k, n, x):
    """Binomial log-likelihood of k successes out of n trials at probability x.

    The success and failure probabilities are clamped to at least 1e-10 so
    that log(0) can never be evaluated when x is exactly 0 or 1.
    """
    p_success = max(x, 1e-10)
    p_failure = max(1 - x, 1e-10)
    return k * log(p_success) + (n - k) * log(p_failure)
def score(bigram, counts, n_words):
    """Dunning likelihood-ratio score of ``bigram`` being a collocation.

    Notation follows http://nlp.stanford.edu/fsnlp/promo/colloc.pdf:
    ``counts`` maps both single words and (word, word) tuples to their
    occurrence counts, and ``n_words`` is the total number of words seen.
    A larger score means the pair co-occurs more often than independence
    would predict.
    """
    total = n_words
    count_pair = counts[bigram]
    count_first = counts[bigram[0]]
    count_second = counts[bigram[1]]
    # Null hypothesis: P(w2 | w1) == P(w2 | not w1) == p
    p = count_second / total
    # Alternative hypothesis: the two conditional probabilities differ.
    p_given_first = count_pair / count_first
    p_given_other = (count_second - count_pair) / (total - count_first)
    log_lambda = (
        l(count_pair, count_first, p)
        + l(count_second - count_pair, total - count_first, p)
        - l(count_pair, count_first, p_given_first)
        - l(count_second - count_pair, total - count_first, p_given_other)
    )
    return -2 * log_lambda
def grey_color_func(word, font_size, position, orientation, random_state=None,
                    **kwargs):
    """Return a random grey for a word (WordCloud ``color_func`` signature).

    Every argument other than the module-level random state is ignored; the
    result is an HSL string with zero hue and saturation and a lightness
    drawn uniformly from [60, 100].
    """
    lightness = random.randint(60, 100)
    return "hsl(0, 0%%, %d%%)" % lightness
...
...
@@ -35,6 +63,7 @@ def pairwise(iterable):
next
(
b
,
None
)
return
zip
(
a
,
b
)
def
unigrams_and_bigrams
(
text
,
stopwords
=
None
):
stopwords
=
[
s
.
lower
()
for
s
in
stopwords
]
if
stopwords
is
not
None
else
[]
words
=
re
.
findall
(
r
"\w[\w']+"
,
text
)
...
...
@@ -54,19 +83,19 @@ def unigrams_and_bigrams(text, stopwords=None):
for
bigram
in
bigrams
:
# join tuples by a space
counts_bigrams
[
bigram
]
+=
1
counts_all
=
{}
counts_all
.
update
(
counts_unigrams
)
counts_all
.
update
(
counts_bigrams
)
counts_all
.
update
(
counts_bigrams
)
# decount words inside bigrams
for
bigram
in
counts_bigrams
()
.
keys
():
for
bigram
in
counts_bigrams
.
keys
():
# collocation detection (30 is arbitrary):
if
score
(
bigram
,
counts_all
,
n_words
)
>
30
:
counts_unigrams
[
bigram
[
0
]]
-=
counts_bigrams
[
bigram
]
counts_unigrams
[
bigram
[
1
]]
-=
counts_bigrams
[
bigram
]
# add joined bigram into unigrams
counts_unigrams
[
' '
.
join
(
bigram
)]
=
counts_bigram
[
bigram
]
counts_unigrams
[
' '
.
join
(
bigram
)]
=
counts_bigram
s
[
bigram
]
return
counts_unigrams
...
...
@@ -91,6 +120,6 @@ wc = WordCloud(max_words=1000, mask=mask, margin=10,
# from_frequencies ignores "stopwords" so we have to do it ourselves
wc
.
generate_from_frequencies
(
unigrams_and_bigrams
(
text
,
STOPWORDS
).
items
())
plt
.
imshow
(
wc
)
wc
.
to_file
(
"a_new_hope.png"
)
wc
.
to_file
(
"a_new_hope
_bigrams
.png"
)
plt
.
axis
(
"off"
)
plt
.
show
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment