5 | 5 |
6 | 6 | import numpy as np |
7 | 7 | import pandas as pd |
| 8 | +from nltk.corpus import stopwords |
| 9 | +import nltk |
8 | 10 |
9 | 11 | from ydata_profiling.config import Settings |
10 | 12 | from ydata_profiling.model.pandas.imbalance_pandas import column_imbalance_score |
18 | 20 | ) |
19 | 21 |
20 | 22 |
| 23 | +nltk.download('stopwords') |
| 24 | + |
| 25 | + |
21 | 26 | def get_character_counts_vc(vc: pd.Series) -> pd.Series: |
22 | 27 | series = pd.Series(vc.index, index=vc) |
23 | 28 | characters = series[series != ""].apply(list) |
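A note on the module-level `nltk.download('stopwords')` added above: it runs on every import of the module and, when the corpus is not cached locally, reaches out to the network. A minimal sketch of a guarded alternative, assuming NLTK's standard resource lookup (not part of this patch):

```python
import nltk

# Fetch the stopwords corpus only if it is missing locally;
# nltk.data.find raises LookupError for absent resources.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords", quiet=True)
```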
@@ -151,41 +156,58 @@ def unicode_summary_vc(vc: pd.Series) -> dict: |
151 | 156 | return summary |
152 | 157 |
153 | 158 |
154 | | -def word_summary_vc(vc: pd.Series, stop_words: List[str] = []) -> dict: |
| 159 | +def word_summary_vc( |
| 160 | + vc: pd.Series, |
| 161 | + stop_words: List[str] = [], |
| 162 | + remove_default_stopwords: bool = True, |
| 163 | + keep_stopwords: List[str] = [] |
| 164 | +) -> dict: |
155 | 165 | """Count the number of occurrences of each individual word across |
156 | 166 | all lines of the data Series, then sort from the word with the most |
157 | 167 | occurrences to the word with the least occurrences. If a list of |
158 | | - stop words is given, they will be ignored. |
| 168 | + stop words is given, they will be ignored, along with default |
| 169 | + English stopwords if remove_default_stopwords is True. |
159 | 170 |
160 | 171 | Args: |
161 | 172 | vc: Series containing all unique categories as index and their |
162 | 173 | frequency as value. Sorted from the most frequent down. |
163 | 174 | stop_words: List of stop words to ignore, empty by default. |
| 175 | + remove_default_stopwords: Whether to also remove NLTK's default |
| 176 | + English stopwords. Defaults to True. |
| 177 | + keep_stopwords: List of stop words to keep, even if they are |
| 178 | + part of the default or custom stop words. |
164 | 179 |
165 | 180 | Returns: |
166 | 181 | A dict containing the results as a Series with unique words as |
167 | | - index and the computed frequency as value |
| 182 | + index and the computed frequency as value. |
168 | 183 | """ |
169 | | - # TODO: configurable lowercase/punctuation etc. |
170 | | - # TODO: remove punctuation in words |
| 184 | + # Normalize the custom stop words into a lowercase set |
| 185 | + stop_words = {word.lower() for word in stop_words} |
| 186 | + |
| 187 | + # Merge default stop words if enabled |
| 188 | + if remove_default_stopwords: |
| 189 | + default_stop_words = set(stopwords.words('english')) |
| 190 | + stop_words = stop_words.union(default_stop_words) |
171 | 191 |
| 192 | + # Remove any words specified in keep_stopwords |
| 193 | + stop_words -= set(word.lower() for word in keep_stopwords) |
| 194 | + |
| 195 | + # Prepare series for word count |
172 | 196 | series = pd.Series(vc.index, index=vc) |
173 | 197 | word_lists = series.str.lower().str.split() |
174 | 198 | words = word_lists.explode().str.strip(string.punctuation + string.whitespace) |
175 | 199 | word_counts = pd.Series(words.index, index=words) |
176 | | - # fix for pandas 1.0.5 |
177 | 200 | word_counts = word_counts[word_counts.index.notnull()] |
178 | 201 | word_counts = word_counts.groupby(level=0, sort=False).sum() |
179 | 202 | word_counts = word_counts.sort_values(ascending=False) |
180 | 203 |
181 | | - # Remove stop words |
182 | | - if len(stop_words) > 0: |
183 | | - stop_words = [x.lower() for x in stop_words] |
184 | | - word_counts = word_counts.loc[~word_counts.index.isin(stop_words)] |
| 204 | + # Exclude stop words |
| 205 | + word_counts = word_counts.loc[~word_counts.index.isin(stop_words)] |
185 | 206 |
186 | 207 | return {"word_counts": word_counts} if not word_counts.empty else {} |
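A quick usage sketch of the revised function, with `word_summary_vc` in scope and made-up value counts (the profiler normally builds `vc` internally, with unique values as index and their frequencies as values):

```python
import pandas as pd

# Hypothetical value counts: unique strings as index, frequencies as values.
vc = pd.Series([4, 2, 1], index=["the quick fox", "the quick dog", "the dog"])

summary = word_summary_vc(vc, stop_words=["fox"], keep_stopwords=["the"])
print(summary["word_counts"])
# the      7
# quick    6
# dog      3
# dtype: int64
```

"fox" is dropped via the custom list and the default English stopwords are dropped as well, but "the" survives because `keep_stopwords` removes it from the merged stop-word set. The mutable `[]` defaults are safe here: `stop_words` is rebound to a new set inside the function and `keep_stopwords` is only read.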
187 | 208 |
188 | 209 |
| 210 | + |
189 | 211 | def length_summary_vc(vc: pd.Series) -> dict: |
190 | 212 | series = pd.Series(vc.index, index=vc) |
191 | 213 | length = series.str.len() |