5 | 5 |
6 | 6 | import numpy as np |
7 | 7 | import pandas as pd |
| 8 | +from nltk.corpus import stopwords |
| 9 | +import nltk |
8 | 10 |
9 | 11 | from ydata_profiling.config import Settings |
10 | 12 | from ydata_profiling.model.pandas.imbalance_pandas import column_imbalance_score |
18 | 20 | ) |
19 | 21 |
20 | 22 |
| 23 | +nltk.download('stopwords') |
| 24 | + |
| 25 | + |
21 | 26 | def get_character_counts_vc(vc: pd.Series) -> pd.Series: |
22 | 27 | series = pd.Series(vc.index, index=vc) |
23 | 28 | characters = series[series != ""].apply(list) |
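A note on the module-level `nltk.download('stopwords')` added above: it runs on every import of the module and, when the corpus is not cached locally, reaches out to the network. A minimal sketch of a guarded alternative, assuming NLTK's standard resource lookup (not part of this patch):

```python
import nltk

# Fetch the stopwords corpus only if it is missing locally;
# nltk.data.find raises LookupError for absent resources.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords", quiet=True)
```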
@@ -151,41 +156,58 @@ def unicode_summary_vc(vc: pd.Series) -> dict: |
151 | 156 | return summary |
152 | 157 |
153 | 158 |
154 | | -def word_summary_vc(vc: pd.Series, stop_words: List[str] = []) -> dict: |
| 159 | +def word_summary_vc( |
| 160 | + vc: pd.Series, |
| 161 | + stop_words: List[str] = [], |
| 162 | + remove_default_stopwords: bool = True, |
| 163 | + keep_stopwords: List[str] = [] |
| 164 | +) -> dict: |
155 | 165 | """Count the number of occurrences of each individual word across |
156 | 166 | all lines of the data Series, then sort from the word with the most |
157 | 167 | occurrences to the word with the least occurrences. If a list of |
158 | | - stop words is given, they will be ignored. |
| 168 | + stop words is given, they will be ignored, along with default |
| 169 | + English stopwords if remove_default_stopwords is True. |
159 | 170 |
160 | 171 | Args: |
161 | 172 | vc: Series containing all unique categories as index and their |
162 | 173 | frequency as value. Sorted from the most frequent down. |
163 | 174 | stop_words: List of stop words to ignore, empty by default. |
| 175 | + remove_default_stopwords: Whether to also remove NLTK's default |
| 176 | + English stopwords. Defaults to True. |
| 177 | + keep_stopwords: List of stop words to keep, even if they are |
| 178 | + part of the default or custom stop words. |
164 | 179 |
165 | 180 | Returns: |
166 | 181 | A dict containing the results as a Series with unique words as |
167 | | - index and the computed frequency as value |
| 182 | + index and the computed frequency as value. |
168 | 183 | """ |
169 | | - # TODO: configurable lowercase/punctuation etc. |
170 | | - # TODO: remove punctuation in words |
| 184 | + # Normalize the custom stop words into a lowercase set |
| 185 | + stop_words = {word.lower() for word in stop_words} |
| 186 | + |
| 187 | + # Merge default stop words if enabled |
| 188 | + if remove_default_stopwords: |
| 189 | + default_stop_words = set(stopwords.words('english')) |
| 190 | + stop_words = stop_words.union(default_stop_words) |
171 | 191 |
| 192 | + # Remove any words specified in keep_stopwords |
| 193 | + stop_words -= set(word.lower() for word in keep_stopwords) |
| 194 | + |
| 195 | + # Prepare series for word count |
172 | 196 | series = pd.Series(vc.index, index=vc) |
173 | 197 | word_lists = series.str.lower().str.split() |
174 | 198 | words = word_lists.explode().str.strip(string.punctuation + string.whitespace) |
175 | 199 | word_counts = pd.Series(words.index, index=words) |
176 | | - # fix for pandas 1.0.5 |
177 | 200 | word_counts = word_counts[word_counts.index.notnull()] |
178 | 201 | word_counts = word_counts.groupby(level=0, sort=False).sum() |
179 | 202 | word_counts = word_counts.sort_values(ascending=False) |
180 | 203 |
181 | | - # Remove stop words |
182 | | - if len(stop_words) > 0: |
183 | | - stop_words = [x.lower() for x in stop_words] |
184 | | - word_counts = word_counts.loc[~word_counts.index.isin(stop_words)] |
| 204 | + # Exclude stop words |
| 205 | + word_counts = word_counts.loc[~word_counts.index.isin(stop_words)] |
185 | 206 |
186 | 207 | return {"word_counts": word_counts} if not word_counts.empty else {} |
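A quick usage sketch of the revised function, with `word_summary_vc` in scope and made-up value counts (the profiler normally builds `vc` internally, with unique values as index and their frequencies as values):

```python
import pandas as pd

# Hypothetical value counts: unique strings as index, frequencies as values.
vc = pd.Series([4, 2, 1], index=["the quick fox", "the quick dog", "the dog"])

summary = word_summary_vc(vc, stop_words=["fox"], keep_stopwords=["the"])
print(summary["word_counts"])
# the      7
# quick    6
# dog      3
# dtype: int64
```

"fox" is dropped via the custom list and the default English stopwords are dropped as well, but "the" survives because `keep_stopwords` removes it from the merged stop-word set. The mutable `[]` defaults are safe here: `stop_words` is rebound to a new set inside the function and `keep_stopwords` is only read.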
187 | 208 |
188 | 209 |
| 210 | + |
189 | 211 | def length_summary_vc(vc: pd.Series) -> dict: |
190 | 212 | series = pd.Series(vc.index, index=vc) |
191 | 213 | length = series.str.len() |