You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
42 lines
593 B
42 lines
593 B
# Performance note: I benchmarked this code using a set instead of
# a list for the stopwords and was surprised to find that the list
# performed /better/ than the set - maybe because it's only a small
# list.

# Lowercase words that strip_stopwords() drops from a sentence. Written
# one word per line and split on whitespace so entries are easy to add
# or remove.
stopwords = '''
i
a
an
are
as
at
be
by
for
from
how
in
is
it
of
on
or
that
the
this
to
was
what
when
where
'''.split()
|
|
|
def strip_stopwords(sentence):
    """Remove stopwords from *sentence* and normalize its whitespace.

    The sentence is split on runs of whitespace, every word whose
    lowercased form appears in the module-level ``stopwords`` list is
    dropped, and the remaining words are joined with single spaces.
    Words that are kept retain their original casing.
    """
    # Compare the lowercased word so "The" and "the" are both removed,
    # but append the word itself so kept words are not case-folded.
    kept = [
        word for word in sentence.split()
        if word.lower() not in stopwords
    ]
    return u' '.join(kept)
|
|
|
|