Add the donut chart function to Corpus class.

ajdavidl · ajdavidl · commit 471d3d8ed19f · 2023-05-06T00:07:25.000-03:00
diff --git a/examples.ipynb b/examples.ipynb
@@ -89,7 +89,7 @@
    "outputs": [],
    "source": [
     "mystopwords = stopwords.words('english')+['ax','edu','com','would','nntp','ac','co','gv','bf','db','tin','apr','gmt','na','pl','di','inc','gov','max','acs','cs',\n",
-    "                                         'subject','lines','organization','writes','article','one','posting','host','ca','also','too']"
+    "                                         'subject','lines','organization','writes','article','one','posting','host','ca','also','too','maxaxaxaxaxaxaxaxaxaxaxaxaxaxax']"
    ]
   },
   {
@@ -115,6 +115,13 @@
     "### Frequencies"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Matplotlib"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -142,6 +149,13 @@
     "tv.frequencyPlot(listText=df.text_clean.to_list(), ngramRange=(3,3), stopwords=mystopwords)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Plotly"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -172,6 +186,13 @@
     "fig.show()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Yellowbrick"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -199,6 +220,13 @@
     "tv.frequencyPlotYellowbrick(listText=df.text_clean.to_list(), ngramRange=(3,3), stopwords=mystopwords)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Tree map"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -229,6 +257,49 @@
     "fig.show()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Donut chart"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "fig = tv.frequencyDonutChart(listText=df.text_clean.to_list(), stopwords=mystopwords)\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "fig = tv.frequencyDonutChart(listText=df.text_clean.to_list(), ngramRange=(2,2), stopwords=mystopwords)\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "fig = tv.frequencyDonutChart(listText=df.text_clean.to_list(), ngramRange=(3,3), stopwords=mystopwords)\n",
+    "fig.show()"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -503,6 +574,13 @@
     "### Frequency"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Matplotlib"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -530,6 +608,13 @@
     "c.frequencyPlot(stopwords=mystopwords, labels = [\"rec.autos\",\"rec.motorcycles\"])"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Plotly"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -557,6 +642,13 @@
     "c.frequencyPlot(stopwords=mystopwords, labels = [\"rec.autos\",\"rec.motorcycles\"], package = 'plotly')"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Yellowbrick"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -584,6 +676,13 @@
     "c.frequencyPlot(stopwords=mystopwords, labels = [\"rec.autos\",\"rec.motorcycles\"], package = 'yellowbrick')"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Tree map"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -620,6 +719,46 @@
     "c.frequencyTreeMap(stopwords=mystopwords, ngramRange=(2,2), labels = [\"rec.autos\",\"rec.motorcycles\"])"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Donut chart"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "c.frequencyDonutChart(stopwords=mystopwords)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "c.frequencyDonutChart(ngramRange=(2,2), stopwords=mystopwords)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "c.frequencyDonutChart(ngramRange=(3,3), stopwords=mystopwords)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
diff --git a/textvisualizer/corpus/corpus.py b/textvisualizer/corpus/corpus.py
@@ -180,7 +180,7 @@ def frequencyTreeMap(self, number_of_words=100, stopwords=None, ngramRange=(1, 1
             given, a vocabulary is determined from the input documents. Indices
             in the mapping should not be repeated and should not have any gap
             between 0 and the largest index.
-        
+
         labels : str or list of str, default=None
             Labels to be used to filter the text.
 
@@ -194,7 +194,7 @@ def frequencyTreeMap(self, number_of_words=100, stopwords=None, ngramRange=(1, 1
         else:
             df = self.__mountDataframe(labels=labels)
             return frequencyTreeMap(df.text.tolist(), number_of_words=number_of_words, stopwords=stopwords, ngramRange=ngramRange, vocabulary=vocabulary)
-        
+
     def phraseNet(self, connectors, number_of_pairs=20, labels=None, plotly=False):
         """
         Plot the Phrase net of a list of texts.
@@ -401,3 +401,46 @@ def bigramGraph(self, stopwords=None,  labels=None, total_bigrams=15):
         else:
             df = self.__mountDataframe(labels=labels)
             return bigramGraph(df.text.tolist(), stopwords=stopwords, total_bigrams=total_bigrams)
+
+    def frequencyDonutChart(self, number_of_words=20, stopwords=None, ngramRange=(1, 1), vocabulary=None, labels=None):
+        """
+        Plot a donut chart with the word frequencies of the corpus texts (optionally filtered by labels) using Plotly.
+
+        It uses plotly express under the hood.
+
+        Parameters
+        ----------
+        number_of_words : int, default=20
+            Number of words to be plotted.
+
+        stopwords : list of strings, default=None
+            That list is assumed to contain stop words, all of which will be removed from the resulting tokens.
+
+        ngramRange : tuple (min_n, max_n), default=(1, 1)
+            The lower and upper boundary of the range of n-values for different
+            word n-grams or char n-grams to be extracted. All values of n such
+            that min_n <= n <= max_n will be used. For example, an
+            ``ngramRange`` of ``(1, 1)`` means only unigrams, ``(1, 2)`` means
+            unigrams and bigrams, and ``(2, 2)`` means only bigrams.
+            Only applies if ``analyzer is not callable``.
+
+        vocabulary : Mapping or iterable, default=None
+            Either a Mapping (e.g., a dict) where keys are terms and values are
+            indices in the feature matrix, or an iterable over terms. If not
+            given, a vocabulary is determined from the input documents. Indices
+            in the mapping should not be repeated and should not have any gap
+            between 0 and the largest index.
+
+        labels : str or list of str, default=None
+            Labels to be used to filter the text.
+
+        Returns
+        -------
+            plotly.graph_objs._figure.Figure
+        """
+        if labels is None:
+            return frequencyDonutChart(self.listText, number_of_words=number_of_words, stopwords=stopwords, ngramRange=ngramRange, vocabulary=vocabulary)
+
+        else:
+            df = self.__mountDataframe(labels=labels)
+            return frequencyDonutChart(df.text.tolist(), number_of_words=number_of_words, stopwords=stopwords, ngramRange=ngramRange, vocabulary=vocabulary)