refactor(cursor, description): return column label as str

Brooke-white · Brooke-white · commit 4fc0af9502e8 · 2022-06-30T16:11:22.000-07:00
diff --git a/redshift_connector/cursor.py b/redshift_connector/cursor.py
@@ -8,7 +8,11 @@
 from warnings import warn
 
 import redshift_connector
-from redshift_connector.config import ClientProtocolVersion, table_type_clauses
+from redshift_connector.config import (
+    ClientProtocolVersion,
+    _client_encoding,
+    table_type_clauses,
+)
 from redshift_connector.error import (
     MISSING_MODULE_ERROR_MSG,
     InterfaceError,
@@ -165,12 +169,17 @@ def truncated_row_desc(self: "Cursor"):
     def _getDescription(self: "Cursor") -> typing.Optional[typing.List[typing.Optional[typing.Tuple]]]:
         if self.ps is None:
             return None
-        row_desc: typing.List[typing.Dict[str, typing.Union[bytes, int, typing.Callable]]] = self.ps["row_desc"]
+        row_desc: typing.List[typing.Dict[str, typing.Union[bytes, str, int, typing.Callable]]] = self.ps["row_desc"]
         if len(row_desc) == 0:
             return None
         columns: typing.List[typing.Optional[typing.Tuple]] = []
         for col in row_desc:
-            columns.append((col["label"], col["type_oid"], None, None, None, None, None))
+            try:
+                col_name: typing.Union[str, bytes] = typing.cast(bytes, col["label"]).decode(_client_encoding)
+            except UnicodeError:
+                warn("failed to decode column name: {}, reverting to bytes".format(col["label"]))  # type: ignore
+                col_name = typing.cast(bytes, col["label"])
+            columns.append((col_name, col["type_oid"], None, None, None, None, None))
         return columns
 
     ##
@@ -503,12 +512,6 @@ def fetch_dataframe(self: "Cursor", num: typing.Optional[int] = None) -> typing.
 
         columns: typing.Optional[typing.List[typing.Union[str, bytes]]] = None
         try:
-            columns = [column[0].decode().lower() for column in self.description]
-        except UnicodeError as e:
-            warn(
-                "Unable to decode column names. Byte values will be used for pandas dataframe column labels.",
-                stacklevel=2,
-            )
             columns = [column[0].lower() for column in self.description]
         except:
             warn("No row description was found. pandas dataframe will be missing column labels.", stacklevel=2)
diff --git a/test/integration/test_cursor.py b/test/integration/test_cursor.py
@@ -0,0 +1,36 @@
+import pytest  # type: ignore
+
+import redshift_connector
+
+
+@pytest.mark.parametrize("col_name", (("apples", "apples"), ("author‎ ", "author\u200e")))
+def test_get_description(db_kwargs, col_name):
+    given_col_name, exp_col_name = col_name
+    with redshift_connector.connect(**db_kwargs) as conn:
+        with conn.cursor() as cursor:
+            cursor.execute("create temp table tmptbl({} int)".format(given_col_name))
+            cursor.execute("select * from tmptbl")
+            assert cursor.description is not None
+            assert cursor.description[0][0] == exp_col_name
+
+
+@pytest.mark.parametrize(
+    "col_names",
+    (
+        ("(c1 int, c2 int, c3 int)", ("c1", "c2", "c3")),
+        (
+            "(áppleṣ int, orañges int, passion⁘fruit int, papaya  int, bañanaș int)",
+            ("áppleṣ", "orañges", "passion⁘fruit", "papaya\u205f", "bañanaș"),
+        ),
+    ),
+)
+def test_get_description_multiple_column_names(db_kwargs, col_names):
+    given_col_names, exp_col_names = col_names
+    with redshift_connector.connect(**db_kwargs) as conn:
+        with conn.cursor() as cursor:
+            cursor.execute("create temp table tmptbl {}".format(given_col_names))
+            cursor.execute("select * from tmptbl")
+            assert cursor.description is not None
+
+            for cidx, column in enumerate(cursor.description):
+                assert column[0] == exp_col_names[cidx]
diff --git a/test/integration/test_dbapi20.py b/test/integration/test_dbapi20.py
@@ -110,7 +110,7 @@ def test_description(con):
     cur.execute("select name from %sbooze" % table_prefix)
     assert len(cur.description) == 1, "cursor.description describes too many columns"
     assert len(cur.description[0]) == 7, "cursor.description[x] tuples must have 7 elements"
-    assert cur.description[0][0].lower() == b"name", "cursor.description[x][0] must return column name"
+    assert cur.description[0][0].lower() == "name", "cursor.description[x][0] must return column name"
     assert cur.description[0][1] == driver.STRING, (
         "cursor.description[x][1] must return column type. Got %r" % cur.description[0][1]
     )
diff --git a/test/integration/test_pandas.py b/test/integration/test_pandas.py
@@ -54,6 +54,7 @@ def test_fetch_dataframe(db_table):
         cursor.execute("select * from book; ")
         result = cursor.fetch_dataframe()
         assert result.columns[0] == "bookname"
+        assert result.columns[1] == "author\u200e"
 
 
 @pandas_only
diff --git a/test/unit/test_cursor.py b/test/unit/test_cursor.py
@@ -11,14 +11,27 @@
 IS_SINGLE_DATABASE_METADATA_TOGGLE: typing.List[bool] = [True, False]
 
 
-test_warn_response_data: typing.List[typing.Tuple[typing.Optional[typing.List[bytes]], str]] = [
-    ([b"ab\xffcd"], "Unable to decode column names. Byte values will be used for pandas dataframe column labels."),
+description_warn_response_data: typing.List[typing.Tuple[bytes, str]] = [
+    (b"ab\xffcd", "failed to decode column name"),
+]
+
+
+@pytest.mark.parametrize("_input", description_warn_response_data)
+def test_get_description_warns_user(_input):
+    data, exp_warning_msg = _input
+    mock_cursor: Cursor = Cursor.__new__(Cursor)
+    mock_cursor.__setattr__("ps", {"row_desc": [{"type_oid": 1043, "label": data, "column_name": b"c1"}]})
+    with pytest.warns(UserWarning, match=exp_warning_msg):
+        mock_cursor.description
+
+
+fetch_df_warn_response_data: typing.List[typing.Tuple[typing.Optional[typing.List[bytes]], str]] = [
     (None, "No row description was found. pandas dataframe will be missing column labels."),
 ]
 
 
 @pandas_only
-@pytest.mark.parametrize("_input", test_warn_response_data)
+@pytest.mark.parametrize("_input", fetch_df_warn_response_data)
 def test_fetch_dataframe_warns_user(_input, mocker):
     data, exp_warning_msg = _input
     mock_cursor: Cursor = Cursor.__new__(Cursor)

Original file line number	Diff line number	Diff line change
`@@ -110,7 +110,7 @@ def test_description(con):`
`110`	`110`	`cur.execute("select name from %sbooze" % table_prefix)`
`111`	`111`	`assert len(cur.description) == 1, "cursor.description describes too many columns"`
`112`	`112`	`assert len(cur.description[0]) == 7, "cursor.description[x] tuples must have 7 elements"`
`113`		`- assert cur.description[0][0].lower() == b"name", "cursor.description[x][0] must return column name"`
	`113`	`+ assert cur.description[0][0].lower() == "name", "cursor.description[x][0] must return column name"`
`114`	`114`	`assert cur.description[0][1] == driver.STRING, (`
`115`	`115`	`"cursor.description[x][1] must return column type. Got %r" % cur.description[0][1]`
`116`	`116`	`)`