Skip to content

Commit 471d3d8

Browse files
committed
Add the donut chart function to Corpus class.
1 parent 36180d0 commit 471d3d8

File tree

2 files changed

+185
-3
lines changed

2 files changed

+185
-3
lines changed

examples.ipynb

Lines changed: 140 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@
8989
"outputs": [],
9090
"source": [
9191
"mystopwords = stopwords.words('english')+['ax','edu','com','would','nntp','ac','co','gv','bf','db','tin','apr','gmt','na','pl','di','inc','gov','max','acs','cs',\n",
92-
" 'subject','lines','organization','writes','article','one','posting','host','ca','also','too']"
92+
" 'subject','lines','organization','writes','article','one','posting','host','ca','also','too','maxaxaxaxaxaxaxaxaxaxaxaxaxaxax']"
9393
]
9494
},
9595
{
@@ -115,6 +115,13 @@
115115
"### Frequencies"
116116
]
117117
},
118+
{
119+
"cell_type": "markdown",
120+
"metadata": {},
121+
"source": [
122+
"#### Matplotlib"
123+
]
124+
},
118125
{
119126
"cell_type": "code",
120127
"execution_count": null,
@@ -142,6 +149,13 @@
142149
"tv.frequencyPlot(listText=df.text_clean.to_list(), ngramRange=(3,3), stopwords=mystopwords)"
143150
]
144151
},
152+
{
153+
"cell_type": "markdown",
154+
"metadata": {},
155+
"source": [
156+
"#### Plotly"
157+
]
158+
},
145159
{
146160
"cell_type": "code",
147161
"execution_count": null,
@@ -172,6 +186,13 @@
172186
"fig.show()"
173187
]
174188
},
189+
{
190+
"cell_type": "markdown",
191+
"metadata": {},
192+
"source": [
193+
"#### Yellowbrick"
194+
]
195+
},
175196
{
176197
"cell_type": "code",
177198
"execution_count": null,
@@ -199,6 +220,13 @@
199220
"tv.frequencyPlotYellowbrick(listText=df.text_clean.to_list(), ngramRange=(3,3), stopwords=mystopwords)"
200221
]
201222
},
223+
{
224+
"cell_type": "markdown",
225+
"metadata": {},
226+
"source": [
227+
"#### Tree map"
228+
]
229+
},
202230
{
203231
"cell_type": "code",
204232
"execution_count": null,
@@ -229,6 +257,49 @@
229257
"fig.show()"
230258
]
231259
},
260+
{
261+
"cell_type": "markdown",
262+
"metadata": {},
263+
"source": [
264+
"### Donut chart"
265+
]
266+
},
267+
{
268+
"cell_type": "code",
269+
"execution_count": null,
270+
"metadata": {
271+
"tags": []
272+
},
273+
"outputs": [],
274+
"source": [
275+
"fig = tv.frequencyDonutChart(listText=df.text_clean.to_list(), stopwords=mystopwords)\n",
276+
"fig.show()"
277+
]
278+
},
279+
{
280+
"cell_type": "code",
281+
"execution_count": null,
282+
"metadata": {
283+
"tags": []
284+
},
285+
"outputs": [],
286+
"source": [
287+
"fig = tv.frequencyDonutChart(listText=df.text_clean.to_list(), ngramRange=(2,2), stopwords=mystopwords)\n",
288+
"fig.show()"
289+
]
290+
},
291+
{
292+
"cell_type": "code",
293+
"execution_count": null,
294+
"metadata": {
295+
"tags": []
296+
},
297+
"outputs": [],
298+
"source": [
299+
"fig = tv.frequencyDonutChart(listText=df.text_clean.to_list(), ngramRange=(3,3), stopwords=mystopwords)\n",
300+
"fig.show()"
301+
]
302+
},
232303
{
233304
"cell_type": "markdown",
234305
"metadata": {},
@@ -503,6 +574,13 @@
503574
"### Frequency"
504575
]
505576
},
577+
{
578+
"cell_type": "markdown",
579+
"metadata": {},
580+
"source": [
581+
"#### Matplotlib"
582+
]
583+
},
506584
{
507585
"cell_type": "code",
508586
"execution_count": null,
@@ -530,6 +608,13 @@
530608
"c.frequencyPlot(stopwords=mystopwords, labels = [\"rec.autos\",\"rec.motorcycles\"])"
531609
]
532610
},
611+
{
612+
"cell_type": "markdown",
613+
"metadata": {},
614+
"source": [
615+
"#### Plotly"
616+
]
617+
},
533618
{
534619
"cell_type": "code",
535620
"execution_count": null,
@@ -557,6 +642,13 @@
557642
"c.frequencyPlot(stopwords=mystopwords, labels = [\"rec.autos\",\"rec.motorcycles\"], package = 'plotly')"
558643
]
559644
},
645+
{
646+
"cell_type": "markdown",
647+
"metadata": {},
648+
"source": [
649+
"#### Yellowbrick"
650+
]
651+
},
560652
{
561653
"cell_type": "code",
562654
"execution_count": null,
@@ -584,6 +676,13 @@
584676
"c.frequencyPlot(stopwords=mystopwords, labels = [\"rec.autos\",\"rec.motorcycles\"], package = 'yellowbrick')"
585677
]
586678
},
679+
{
680+
"cell_type": "markdown",
681+
"metadata": {},
682+
"source": [
683+
"#### Tree Map"
684+
]
685+
},
587686
{
588687
"cell_type": "code",
589688
"execution_count": null,
@@ -620,6 +719,46 @@
620719
"c.frequencyTreeMap(stopwords=mystopwords, ngramRange=(2,2), labels = [\"rec.autos\",\"rec.motorcycles\"])"
621720
]
622721
},
722+
{
723+
"cell_type": "markdown",
724+
"metadata": {},
725+
"source": [
726+
"### Donut chart"
727+
]
728+
},
729+
{
730+
"cell_type": "code",
731+
"execution_count": null,
732+
"metadata": {
733+
"tags": []
734+
},
735+
"outputs": [],
736+
"source": [
737+
"c.frequencyDonutChart(stopwords=mystopwords)"
738+
]
739+
},
740+
{
741+
"cell_type": "code",
742+
"execution_count": null,
743+
"metadata": {
744+
"tags": []
745+
},
746+
"outputs": [],
747+
"source": [
748+
"c.frequencyDonutChart(ngramRange=(2,2), stopwords=mystopwords)"
749+
]
750+
},
751+
{
752+
"cell_type": "code",
753+
"execution_count": null,
754+
"metadata": {
755+
"tags": []
756+
},
757+
"outputs": [],
758+
"source": [
759+
"c.frequencyDonutChart(ngramRange=(3,3), stopwords=mystopwords)"
760+
]
761+
},
623762
{
624763
"cell_type": "markdown",
625764
"metadata": {},

textvisualizer/corpus/corpus.py

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ def frequencyTreeMap(self, number_of_words=100, stopwords=None, ngramRange=(1, 1
180180
given, a vocabulary is determined from the input documents. Indices
181181
in the mapping should not be repeated and should not have any gap
182182
between 0 and the largest index.
183-
183+
184184
labels : str or list of str, default=None
185185
Labels to be used to filter the text.
186186
@@ -194,7 +194,7 @@ def frequencyTreeMap(self, number_of_words=100, stopwords=None, ngramRange=(1, 1
194194
else:
195195
df = self.__mountDataframe(labels=labels)
196196
return frequencyTreeMap(df.text.tolist(), number_of_words=number_of_words, stopwords=stopwords, ngramRange=ngramRange, vocabulary=vocabulary)
197-
197+
198198
def phraseNet(self, connectors, number_of_pairs=20, labels=None, plotly=False):
199199
"""
200200
Plot the Phrase net of a list of texts.
@@ -401,3 +401,46 @@ def bigramGraph(self, stopwords=None, labels=None, total_bigrams=15):
401401
else:
402402
df = self.__mountDataframe(labels=labels)
403403
return bigramGraph(df.text.tolist(), stopwords=stopwords, total_bigrams=total_bigrams)
404+
405+
def frequencyDonutChart(self, number_of_words=20, stopwords=None, ngramRange=(1, 1), vocabulary=None, labels=None):
    """
    Plot a donut chart with the word frequencies of the corpus texts.

    The work is delegated to the module-level ``frequencyDonutChart``
    function, which builds the chart with Plotly (plotly express).

    Parameters
    ----------
    number_of_words : int, default=20
        Number of words to be plotted.

    stopwords : list of strings, default=None
        That list is assumed to contain stop words, all of which will be
        removed from the resulting tokens.

    ngramRange : tuple (min_n, max_n), default=(1, 1)
        The lower and upper boundary of the range of n-values for the
        n-grams to be extracted. All values of n such that
        min_n <= n <= max_n will be used. For example an ``ngramRange``
        of ``(1, 1)`` means only unigrams, ``(1, 2)`` means unigrams and
        bigrams, and ``(2, 2)`` means only bigrams.

    vocabulary : Mapping or iterable, default=None
        Either a Mapping (e.g., a dict) where keys are terms and values are
        indices in the feature matrix, or an iterable over terms. If not
        given, a vocabulary is determined from the input documents. Indices
        in the mapping should not be repeated and should not have any gap
        between 0 and the largest index.

    labels : str or list of str, default=None
        Labels to be used to filter the text. When None, every text in
        ``self.listText`` is used.

    Returns
    -------
    plotly.graph_objs._figure.Figure
    """
    if labels is not None:
        # Restrict the input to the texts selected by the given labels.
        # NOTE(review): __mountDataframe presumably returns a DataFrame
        # with a ``text`` column -- confirm against its definition.
        texts = self.__mountDataframe(labels=labels).text.tolist()
    else:
        texts = self.listText

    return frequencyDonutChart(
        texts,
        number_of_words=number_of_words,
        stopwords=stopwords,
        ngramRange=ngramRange,
        vocabulary=vocabulary,
    )

0 commit comments

Comments
 (0)