Commit b32f973

feat: Add option to remove default stopwords from word summary

CMG203 authored and fabclmnt committed
1 parent 0ed32b7, commit b32f973

File tree

4 files changed: +66, -17 lines


requirements-dev.txt

Lines changed: 1 addition & 0 deletions
@@ -9,3 +9,4 @@ sphinx_rtd_theme>=0.4.3
 sphinx-autodoc-typehints>=1.10.3
 sphinx-multiversion>=0.2.3
 autodoc_pydantic
+nltk

requirements-test.txt

Lines changed: 2 additions & 1 deletion
@@ -7,4 +7,5 @@ nbval
 ipython<9
 pyarrow
 twine>=3.1.1
-kaggle
+kaggle
+nltk
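
nltk is added to both dependency files because the module changed below imports NLTK's English stopword list and downloads the corpus at import time. As a minimal sketch (assuming an environment where nltk is installed), the corpus can be fetched and inspected like this:

import nltk
from nltk.corpus import stopwords

# Download the corpus on first use; this is a no-op if it is already cached.
nltk.download('stopwords', quiet=True)
print(stopwords.words('english')[:5])  # e.g. ['i', 'me', 'my', 'myself', 'we']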

src/ydata_profiling/model/pandas/describe_categorical_pandas.py

Lines changed: 32 additions & 10 deletions
@@ -5,6 +5,8 @@
 
 import numpy as np
 import pandas as pd
+from nltk.corpus import stopwords
+import nltk
 
 from ydata_profiling.config import Settings
 from ydata_profiling.model.pandas.imbalance_pandas import column_imbalance_score
@@ -18,6 +20,9 @@
 )
 
 
+nltk.download('stopwords')
+
+
 def get_character_counts_vc(vc: pd.Series) -> pd.Series:
     series = pd.Series(vc.index, index=vc)
     characters = series[series != ""].apply(list)
@@ -151,41 +156,58 @@ def unicode_summary_vc(vc: pd.Series) -> dict:
     return summary
 
 
-def word_summary_vc(vc: pd.Series, stop_words: List[str] = []) -> dict:
+def word_summary_vc(
+    vc: pd.Series,
+    stop_words: List[str] = [],
+    remove_default_stopwords: bool = True,
+    keep_stopwords: List[str] = []
+) -> dict:
     """Count the number of occurrences of each individual word across
     all lines of the data Series, then sort from the word with the most
     occurrences to the word with the least occurrences. If a list of
-    stop words is given, they will be ignored.
+    stop words is given, they will be ignored, along with default
+    English stopwords if remove_default_stopwords is True.
 
     Args:
         vc: Series containing all unique categories as index and their
            frequency as value. Sorted from the most frequent down.
         stop_words: List of stop words to ignore, empty by default.
+        remove_default_stopwords: Boolean flag to decide if default
+            English stopwords should be removed, default is True.
+        keep_stopwords: List of stop words to keep, even if they are
+            part of the default or custom stop words.
 
     Returns:
         A dict containing the results as a Series with unique words as
-        index and the computed frequency as value
+        index and the computed frequency as value.
     """
-    # TODO: configurable lowercase/punctuation etc.
-    # TODO: remove punctuation in words
+    # Convert custom stop words to lowercase
+    stop_words = {word.lower() for word in stop_words}
+
+    # Merge default stop words if enabled
+    if remove_default_stopwords:
+        default_stop_words = set(stopwords.words('english'))
+        stop_words = stop_words.union(default_stop_words)
 
+    # Remove any words specified in keep_stopwords
+    stop_words -= set(word.lower() for word in keep_stopwords)
+
+    # Prepare series for word count
     series = pd.Series(vc.index, index=vc)
     word_lists = series.str.lower().str.split()
     words = word_lists.explode().str.strip(string.punctuation + string.whitespace)
     word_counts = pd.Series(words.index, index=words)
-    # fix for pandas 1.0.5
     word_counts = word_counts[word_counts.index.notnull()]
     word_counts = word_counts.groupby(level=0, sort=False).sum()
     word_counts = word_counts.sort_values(ascending=False)
 
-    # Remove stop words
-    if len(stop_words) > 0:
-        stop_words = [x.lower() for x in stop_words]
-        word_counts = word_counts.loc[~word_counts.index.isin(stop_words)]
+    # Exclude stop words
+    word_counts = word_counts.loc[~word_counts.index.isin(stop_words)]
 
     return {"word_counts": word_counts} if not word_counts.empty else {}
 
 
+
 def length_summary_vc(vc: pd.Series) -> dict:
     series = pd.Series(vc.index, index=vc)
     length = series.str.len()

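A minimal usage sketch of the updated word_summary_vc, based on the signature above and the expectations in the unit tests below (the example Series mirrors the tests' value_counts_w_words):

import pandas as pd

from ydata_profiling.model.pandas.describe_categorical_pandas import word_summary_vc

# Value counts as produced upstream: unique values as index, frequencies as values.
vc = pd.Series(index=["The dog", "is hungry"], data=[2, 1])

# New default: NLTK's English stopwords ("the", "is") are dropped.
word_summary_vc(vc=vc)["word_counts"].to_dict()
# {'dog': 2, 'hungry': 1}

# Opt out to restore the previous behaviour.
word_summary_vc(vc=vc, remove_default_stopwords=False)["word_counts"].to_dict()
# {'the': 2, 'dog': 2, 'is': 1, 'hungry': 1}

# Keep selected words even though they are in the default stopword list.
word_summary_vc(vc=vc, keep_stopwords=["is"])["word_counts"].to_dict()
# {'dog': 2, 'is': 1, 'hungry': 1}
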
Lines changed: 31 additions & 6 deletions
@@ -1,23 +1,48 @@
 import pandas as pd
 import pytest
-
 from ydata_profiling.model.pandas.describe_categorical_pandas import word_summary_vc
 
 value_counts_w_words = pd.Series(index=["The dog", "is hungry"], data=[2, 1])
 
-
+# Test the basic word summary function
 def test_word_summary_vc():
     assert (
-        word_summary_vc(vc=value_counts_w_words)["word_counts"].to_dict()
+        word_summary_vc(vc=value_counts_w_words, remove_default_stopwords=False)["word_counts"].to_dict()
         == pd.Series(index=["the", "dog", "is", "hungry"], data=[2, 2, 1, 1]).to_dict()
     )
 
-
-@pytest.mark.parametrize("stop_words", [["The"], ["the", "a"]])
+# Test word summary function with custom stop words
+@pytest.mark.parametrize("stop_words", [["the"], ["the", "a"]])
 def test_word_summary_vc_with_stop_words(stop_words):
     assert (
-        word_summary_vc(vc=value_counts_w_words, stop_words=stop_words)[
+        word_summary_vc(vc=value_counts_w_words, stop_words=stop_words, remove_default_stopwords=False)[
             "word_counts"
         ].to_dict()
         == pd.Series(index=["dog", "is", "hungry"], data=[2, 1, 1]).to_dict()
     )
+
+# Test word summary function with default stopwords removed
+def test_word_summary_vc_with_default_stopwords():
+    assert (
+        word_summary_vc(vc=value_counts_w_words, remove_default_stopwords=True)["word_counts"].to_dict()
+        == pd.Series(index=["dog", "hungry"], data=[2, 1]).to_dict()
+    )
+
+# Test word summary function with both custom and default stop words
+@pytest.mark.parametrize(
+    "stop_words, expected",
+    [
+        (["dog"], {"hungry": 1}),  # Custom stop word "dog", "is" removed as a default stopword
+        (["the", "is"], {"dog": 2, "hungry": 1}),  # Custom stop words "the" and "is"
+    ],
+)
+def test_word_summary_vc_with_custom_and_default_stop_words(stop_words, expected):
+    result = word_summary_vc(vc=value_counts_w_words, stop_words=stop_words, remove_default_stopwords=True)["word_counts"].to_dict()
+    assert result == expected
+
+# Test word summary function with keep_stopwords
+def test_word_summary_vc_with_keep_stopwords():
+    assert (
+        word_summary_vc(vc=value_counts_w_words, remove_default_stopwords=True, keep_stopwords=["is"])["word_counts"].to_dict()
+        == pd.Series(index=["dog", "is", "hungry"], data=[2, 1, 1]).to_dict()
+    )
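
Assuming pytest is available, the new tests can be selected by keyword, for example from Python:

import pytest

# Run only the word-summary tests; "-q" keeps the output brief.
pytest.main(["-q", "-k", "word_summary_vc"])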
