Commit 16774180 authored by Andreas Mueller, committed by GitHub

fix collocation score computation on 2.7, remove words with empty counts. (#184)

* fix collocation score computation on 2.7, remove words with empty counts.

* copy dict keys on python3
parent bceab74a
......@@ -36,13 +36,15 @@ Namespaces are one honking great idea -- let's do more of those!
def test_collocations():
    """Compare word counts with collocations disabled and enabled.

    Two clouds are generated from the same text (``THIS``), both with an
    empty stopword list, and their ``words_`` mappings are compared.
    """
    # Baseline: collocations disabled.
    wc = WordCloud(collocations=False, stopwords=[])
    wc.generate(THIS)

    # Same text with collocations enabled.
    wc2 = WordCloud(collocations=True, stopwords=[])
    wc2.generate(THIS)

    # Enabling collocations must yield more entries, including the
    # bigram "is better", which is absent from the baseline.
    assert_greater(len(wc2.words_), len(wc.words_))
    assert_in("is better", wc2.words_)
    assert_not_in("is better", wc.words_)
    # "way may" must not be reported as a collocation.
    # NOTE(review): presumably this word pair occurs in the text but should
    # score below the collocation threshold -- confirm against THIS.
    assert_not_in("way may", wc2.words_)
def test_plurals_numbers():
......
from __future__ import division
from itertools import tee
from operator import itemgetter
from collections import defaultdict
......@@ -54,10 +55,18 @@ def unigrams_and_bigrams(words, normalize_plurals=True):
word2 = standard_form[bigram[1].lower()]
if score(count, counts[word1], counts[word2], n_words) > 30:
# bigram is a collocation
# discount words in unigrams dict. hack because one word might
# appear in multiple collocations at the same time
# (leading to negative counts)
counts_unigrams[word1] -= counts_bigrams[bigram_string]
counts_unigrams[word2] -= counts_bigrams[bigram_string]
# add joined bigram into unigrams
counts_unigrams[bigram_string] = counts_bigrams[bigram_string]
counts_unigrams[bigram_string] = counts_bigrams[bigram_string]
words = list(counts_unigrams.keys())
for word in words:
# remove empty / negative counts
if counts_unigrams[word] <= 0:
del counts_unigrams[word]
return counts_unigrams
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment