feat(cursor): Add method insert_data_bulk (#81)

Yash621 · web-flow · commit affbdef69393 · 2022-01-24T09:48:11.000-08:00
diff --git a/redshift_connector/cursor.py b/redshift_connector/cursor.py
@@ -239,6 +239,101 @@ def executemany(self: "Cursor", operation, param_sets) -> "Cursor":
         self._row_count = -1 if -1 in rowcounts else sum(rowcounts)
         return self
 
+    def insert_data_bulk(
+        self: "Cursor", filename, table_name, column_indexes, column_names, delimeter
+    ) -> "Cursor":
+
+        """runs a single bulk insert statement into the database.
+
+        This method is native to redshift_connector.
+
+         :param filename: str
+             The name of the file to read from.
+         :param table_name: str
+             The name of the table to insert to.
+         :param column_names:list
+             The name of the columns in the table to insert to.
+         :param column_indexes:list
+             The indexes of the columns in the table to insert to.
+         :param delimeter: str
+             The delimeter to use when reading the file.
+
+         Returns
+
+         -------
+         The Cursor object used for executing the specified database operation: :class:`Cursor`
+
+        """
+        if not self.__is_valid_table(table_name):
+            raise InterfaceError(
+                "Invalid table name passed to insert_data_bulk: {}".format(table_name)
+            )
+        if not self.__has_valid_columns(table_name, column_names):
+            raise InterfaceError(
+                "Invalid column names passed to insert_data_bulk: {}".format(table_name)
+            )
+        orig_paramstyle = self.paramstyle
+        import csv
+
+        if len(column_names) != len(column_indexes):
+            raise InterfaceError("Column names and indexes must be the same length")
+        sql_query = f"INSERT INTO  {table_name} ("
+        sql_query += ", ".join(column_names)
+        sql_query += ") VALUES "
+        sql_param_list_template = "(" + ", ".join(["%s"] * len(column_indexes)) + ")"
+        try:
+            with open(filename) as csv_file:
+                reader = csv.reader(csv_file, delimiter=delimeter)
+                next(reader)
+                values_list = []
+                row_count = 0
+                for row in reader:
+                    for column_index in column_indexes:
+                        values_list.append(row[column_index])
+                    row_count += 1
+                sql_param_lists = [sql_param_list_template] * row_count
+                sql_query += ", ".join(sql_param_lists) + ";"
+                self.execute(sql_query, values_list)
+        except Exception as e:
+            raise InterfaceError(e)
+        finally:
+            # reset paramstyle to it's original value
+            self.paramstyle = orig_paramstyle
+
+        return self
+
+    def __has_valid_columns(
+        self: "Cursor", table: str, columns: typing.List[str]
+    ) -> bool:
+        split_table_name: typing.List[str] = table.split(".")
+        q: str = "select 1 from information_schema.columns where table_name = ? and column_name = ?"
+        if len(split_table_name) == 2:
+            q += " and table_schema = ?"
+            param_list = [
+                [split_table_name[1], c, split_table_name[0]] for c in columns
+            ]
+        else:
+            param_list = [[split_table_name[0], c] for c in columns]
+        temp = self.paramstyle
+        self.paramstyle = "qmark"
+        try:
+            for params in param_list:
+                self.execute(q, params)
+                res = self.fetchone()
+                if typing.cast(typing.List[int], res)[0] != 1:
+                    raise InterfaceError(
+                        "Invalid column name: {} specified for table: {}".format(
+                            params[1], table
+                        )
+                    )
+        except:
+            raise
+        finally:
+            # reset paramstyle to it's original value
+            self.paramstyle = temp
+
+        return True
+
     def callproc(self, procname, parameters=None):
         args = [] if parameters is None else parameters
         operation = "CALL " + self.__sanitize_str(procname) + "(" + ", ".join(["%s" for _ in args]) + ")"
diff --git a/test/unit/test_cursor.py b/test/unit/test_cursor.py
@@ -1,6 +1,6 @@
 import typing
 from test.utils import pandas_only
-from unittest.mock import Mock, PropertyMock, patch
+from unittest.mock import Mock, PropertyMock, patch ,mock_open
 
 import pytest  # type: ignore
 
@@ -249,3 +249,114 @@ def test_get_tables_considers_args(is_single_database_metadata_val, _input, sche
     for arg in (schema_pattern, table_name_pattern):
         if arg is not None:
             assert arg in spy.call_args[0][1]
+
+
+@pytest.mark.parametrize("indexes, names", [([1], []), ([], ["c1"])])
+def test_insert_data_column_names_indexes_mismatch_raises(indexes, names, mocker):
+    # mock fetchone to return "True" to ensure the table_name and column_name
+    # validation steps pass
+    mocker.patch("redshift_connector.Cursor.fetchone", return_value=[1])
+
+    mock_cursor: Cursor = Cursor.__new__(Cursor)
+    # mock out the connection
+    mock_cursor._c = Mock()
+    mock_cursor.paramstyle = "qmark"
+
+    with pytest.raises(
+        InterfaceError, match="Column names and indexes must be the same length"
+    ):
+        mock_cursor.insert_data_bulk(
+            filename="test_file",
+            table_name="test_table",
+            column_indexes=indexes,
+            column_names=names,
+            delimeter=",",
+        )
+
+
+in_mem_csv = """\
+col1,col2,col3
+1,3,foo
+2,5,bar
+-1,7,baz"""  # one header row followed by three data rows
+
+insert_bulk_data = [  # (column_indexes, column_names, (expected sql, expected params))
+    (
+        [0],
+        ["col1"],
+        ("INSERT INTO  test_table (col1) VALUES (%s), (%s), (%s);", ["1", "2", "-1"]),
+    ),
+    (
+        [1],
+        ["col2"],
+        ("INSERT INTO  test_table (col2) VALUES (%s), (%s), (%s);", ["3", "5", "7"]),
+    ),
+    (
+        [2],
+        ["col3"],
+        (
+            "INSERT INTO  test_table (col3) VALUES (%s), (%s), (%s);",
+            ["foo", "bar", "baz"],
+        ),
+    ),
+    (
+        [0, 1],
+        ["col1", "col2"],
+        (
+            "INSERT INTO  test_table (col1, col2) VALUES (%s, %s), (%s, %s), (%s, %s);",
+            ["1", "3", "2", "5", "-1", "7"],
+        ),
+    ),
+    (
+        [0, 2],
+        ["col1", "col3"],
+        (
+            "INSERT INTO  test_table (col1, col3) VALUES (%s, %s), (%s, %s), (%s, %s);",
+            ["1", "foo", "2", "bar", "-1", "baz"],
+        ),
+    ),
+    (
+        [1, 2],
+        ["col2", "col3"],
+        (
+            "INSERT INTO  test_table (col2, col3) VALUES (%s, %s), (%s, %s), (%s, %s);",
+            ["3", "foo", "5", "bar", "7", "baz"],
+        ),
+    ),
+    (
+        [0, 1, 2],
+        ["col1", "col2", "col3"],
+        (
+            "INSERT INTO  test_table (col1, col2, col3) VALUES (%s, %s, %s), (%s, %s, %s), (%s, %s, %s);",
+            ["1", "3", "foo", "2", "5", "bar", "-1", "7", "baz"],
+        ),
+    ),
+]
+
+
+@patch("builtins.open", new_callable=mock_open, read_data=in_mem_csv)
+@pytest.mark.parametrize("indexes,names,exp_execute_args", insert_bulk_data)
+def test_insert_data_column_stmt(mocked_csv, indexes, names, exp_execute_args, mocker):
+    # mock fetchone to return "True" to ensure the table_name and column_name
+    # validation steps pass
+    mocker.patch("redshift_connector.Cursor.fetchone", return_value=[1])
+    mock_cursor: Cursor = Cursor.__new__(Cursor)
+
+    # spy on the execute method, so we can check value of sql_query
+    spy = mocker.spy(mock_cursor, "execute")
+
+    # mock out the connection
+    mock_cursor._c = Mock()
+    mock_cursor.paramstyle = "qmark"
+
+    mock_cursor.insert_data_bulk(
+        filename="mocked_csv",
+        table_name="test_table",
+        column_indexes=indexes,
+        column_names=names,
+        delimeter=",",
+    )
+
+    assert spy.called is True
+    assert spy.call_args[0][0] == exp_execute_args[0]
+    assert spy.call_args[0][1] == exp_execute_args[1]