
Commit d912021

Merge pull request #131 from awslabs/read-fixed-width
Add Pandas.read_fwf(), Pandas.read_fwf_list() and Pandas.read_fwf_prefix()
2 parents 3915f44 + 3fc8edc commit d912021

5 files changed: +165 −8 lines

awswrangler/pandas.py

Lines changed: 133 additions & 6 deletions
@@ -237,7 +237,7 @@ def _find_terminator(body, sep, quoting, quotechar, lineterminator):

     @staticmethod
     def _read_csv_once(session_primitives: "SessionPrimitives", bucket_name: str, key_path: str,
-                       **pd_additional_kwargs):
+                       **pd_additional_kwargs) -> pd.DataFrame:
         """
         Read a single CSV file from Amazon S3 using optimized strategies.

@@ -256,7 +256,7 @@ def _read_csv_once(session_primitives: "SessionPrimitives", bucket_name: str, key_path: str,
         if pd_additional_kwargs.get('compression', 'infer') == 'infer':
             pd_additional_kwargs['compression'] = infer_compression(key_path, compression='infer')

-        dataframe = pd.read_csv(buff, **pd_additional_kwargs)
+        dataframe: pd.DataFrame = pd.read_csv(buff, **pd_additional_kwargs)
         buff.close()
         return dataframe

@@ -1613,16 +1613,18 @@ def read_csv_list(
         logger.debug(f"procs_cpu_bound: {procs_cpu_bound}")
         session_primitives = self._session.primitives
         if len(paths) == 1:
-            path = paths[0]
+            path: str = paths[0]
+            bucket_name: str
+            key_path: str
             bucket_name, key_path = Pandas._parse_path(path)
             logger.debug(f"path: {path}")
             df: pd.DataFrame = self._read_csv_once(session_primitives=self._session.primitives,
                                                    bucket_name=bucket_name,
                                                    key_path=key_path,
                                                    **pd_additional_kwargs)
         else:
-            procs = []
-            receive_pipes = []
+            procs: list = []
+            receive_pipes: list = []
             logger.debug(f"len(paths): {len(paths)}")
             for path in paths:
                 receive_pipe, send_pipe = mp.Pipe()
@@ -1639,7 +1641,7 @@ def read_csv_list(
             dfs: List[pd.DataFrame] = []
             for i in range(len(procs)):
                 logger.debug(f"Waiting pipe number: {i}")
-                df_received = receive_pipes[i].recv()
+                df_received: pd.DataFrame = receive_pipes[i].recv()
                 dfs.append(df_received)
                 logger.debug(f"Waiting proc number: {i}")
                 procs[i].join()
@@ -1689,3 +1691,128 @@ def read_csv_prefix(
                                    max_result_size=max_result_size,
                                    procs_cpu_bound=procs_cpu_bound,
                                    **pd_additional_kwargs)
+
+    @staticmethod
+    def _read_fwf(session_primitives: "SessionPrimitives", bucket_name: str, key_path: str, **pd_additional_kwargs) -> pd.DataFrame:
+        """
+        Read a single fixed-width formatted file from Amazon S3 using optimized strategies.
+
+        :param session_primitives: SessionPrimitives()
+        :param bucket_name: S3 bucket name
+        :param key_path: S3 key path (w/o bucket)
+        :param pd_additional_kwargs: Additional parameters forwarded to pandas.read_fwf
+        :return: Pandas Dataframe
+        """
+        buff = BytesIO()
+        session: Session = session_primitives.session
+        client_s3 = session.boto3_session.client(service_name="s3", use_ssl=True, config=session.botocore_config)
+        client_s3.download_fileobj(Bucket=bucket_name, Key=key_path, Fileobj=buff)
+        buff.seek(0)
+        if pd_additional_kwargs.get('compression', 'infer') == 'infer':
+            pd_additional_kwargs['compression'] = infer_compression(key_path, compression='infer')
+        dataframe: pd.DataFrame = pd.read_fwf(buff, **pd_additional_kwargs)
+        buff.close()
+        return dataframe
+
+    @staticmethod
+    def _read_fwf_remote(send_pipe: mp.connection.Connection, session_primitives: "SessionPrimitives",
+                         bucket_name: str, key_path: str, **pd_additional_kwargs):
+        df: pd.DataFrame = Pandas._read_fwf(session_primitives=session_primitives,
+                                            bucket_name=bucket_name,
+                                            key_path=key_path,
+                                            **pd_additional_kwargs)
+        send_pipe.send(df)
+        send_pipe.close()
+
+    def read_fwf(self, path: str, **pd_additional_kwargs) -> pd.DataFrame:
+        """
+        Read a single fixed-width formatted file from Amazon S3 using optimized strategies.
+
+        :param path: Amazon S3 path (e.g. s3://bucket_name/key_name)
+        :param pd_additional_kwargs: Additional parameters forwarded to pandas.read_fwf
+        :return: Pandas Dataframe
+        """
+        bucket_name, key_path = self._parse_path(path)
+        dataframe: pd.DataFrame = self._read_fwf(
+            session_primitives=self._session.primitives,
+            bucket_name=bucket_name,
+            key_path=key_path,
+            **pd_additional_kwargs)
+        return dataframe
+
+    def read_fwf_list(
+            self,
+            paths: List[str],
+            procs_cpu_bound: Optional[int] = None,
+            **pd_additional_kwargs,
+    ) -> pd.DataFrame:
+        """
+        Read a list of fixed-width formatted files from Amazon S3 using optimized strategies.
+
+        :param paths: List of Amazon S3 paths (e.g. ['s3://bucket_name/key_name1', 's3://bucket_name/key_name2'])
+        :param procs_cpu_bound: Number of cores used for CPU bound tasks
+        :param pd_additional_kwargs: Additional parameters forwarded to pandas.read_fwf
+        :return: Pandas Dataframe
+        """
+        procs_cpu_bound = procs_cpu_bound if procs_cpu_bound is not None else self._session.procs_cpu_bound if self._session.procs_cpu_bound is not None else 1
+        logger.debug(f"procs_cpu_bound: {procs_cpu_bound}")
+        session_primitives = self._session.primitives
+        if len(paths) == 1:
+            path: str = paths[0]
+            bucket_name: str
+            key_path: str
+            bucket_name, key_path = Pandas._parse_path(path)
+            logger.debug(f"path: {path}")
+            df: pd.DataFrame = self._read_fwf(session_primitives=self._session.primitives,
+                                              bucket_name=bucket_name,
+                                              key_path=key_path,
+                                              **pd_additional_kwargs)
+        else:
+            procs: list = []
+            receive_pipes: list = []
+            logger.debug(f"len(paths): {len(paths)}")
+            for path in paths:
+                receive_pipe, send_pipe = mp.Pipe()
+                bucket_name, key_path = Pandas._parse_path(path)
+                logger.debug(f"launching path: {path}")
+                proc = mp.Process(target=self._read_fwf_remote,
+                                  args=(send_pipe, session_primitives, bucket_name, key_path),
+                                  kwargs=pd_additional_kwargs)
+                proc.daemon = False
+                proc.start()
+                procs.append(proc)
+                receive_pipes.append(receive_pipe)
+                utils.wait_process_release(processes=procs, target_number=procs_cpu_bound)
+            dfs: List[pd.DataFrame] = []
+            for i in range(len(procs)):
+                logger.debug(f"Waiting pipe number: {i}")
+                df_received: pd.DataFrame = receive_pipes[i].recv()
+                dfs.append(df_received)
+                logger.debug(f"Waiting proc number: {i}")
+                procs[i].join()
+                logger.debug(f"Closing proc number: {i}")
+                receive_pipes[i].close()
+            logger.debug(f"Concatenating all {len(paths)} DataFrames...")
+            df = pd.concat(objs=dfs, ignore_index=True, sort=False)
+            logger.debug("Concatenation done!")
+        return df
+
+    def read_fwf_prefix(
+            self,
+            path_prefix: str,
+            procs_cpu_bound: Optional[int] = None,
+            **pd_additional_kwargs,
+    ) -> pd.DataFrame:
+        """
+        Read all fixed-width formatted files from a given Amazon S3 prefix using optimized strategies.
+
+        :param path_prefix: Amazon S3 prefix (e.g. s3://bucket_name/prefix)
+        :param procs_cpu_bound: Number of cores used for CPU bound tasks
+        :param pd_additional_kwargs: Additional parameters forwarded to pandas.read_fwf
+        :return: Pandas Dataframe
+        """
+        paths: List[str] = self._session.s3.list_objects(path=path_prefix)
+        paths = [p for p in paths if not p.endswith("/")]
+        return self.read_fwf_list(paths=paths,
+                                  procs_cpu_bound=procs_cpu_bound,
+                                  **pd_additional_kwargs)
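
Taken together, the three public methods mirror the existing read_csv(), read_csv_list() and read_csv_prefix() family. A minimal usage sketch (the bucket and key names below are hypothetical; widths and names are ordinary pandas.read_fwf keyword arguments forwarded through **pd_additional_kwargs):

    import awswrangler as wr

    # Single object.
    df = wr.pandas.read_fwf(path="s3://my-bucket/data/file1.txt",
                            widths=[1, 12, 8],
                            names=["id", "name", "date"])

    # Explicit list of objects, fanned out across worker processes.
    df = wr.pandas.read_fwf_list(paths=["s3://my-bucket/data/file1.txt",
                                        "s3://my-bucket/data/file2.txt"],
                                 widths=[1, 12, 8],
                                 names=["id", "name", "date"])

    # Every non-directory object under a prefix, concatenated into one DataFrame.
    df = wr.pandas.read_fwf_prefix(path_prefix="s3://my-bucket/data/",
                                   widths=[1, 12, 8],
                                   names=["id", "name", "date"])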

awswrangler/session.py

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
-import os
 import importlib
+from os import cpu_count
 from typing import Optional, Dict
 from sys import version_info
 from logging import getLogger, Logger
@@ -95,7 +95,7 @@ def __init__(self,
         self._s3_additional_kwargs: Optional[Dict[str, str]] = s3_additional_kwargs
         self._spark_context = spark_context
         self._spark_session = spark_session
-        cpus: Optional[int] = os.cpu_count()
+        cpus: Optional[int] = cpu_count()
         self._procs_cpu_bound: int = 1 if cpus is None else cpus if procs_cpu_bound is None else procs_cpu_bound
         self._procs_io_bound: int = 1 if cpus is None else cpus * Session.PROCS_IO_BOUND_FACTOR if procs_io_bound is None else procs_io_bound
         self._athena_workgroup: str = athena_workgroup
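
The one-line conditional expressions that derive the process counts are dense. A spelled-out equivalent of the procs_cpu_bound line, as a sketch for illustration only (resolve_procs_cpu_bound is a hypothetical helper, not part of the commit):

    from os import cpu_count
    from typing import Optional

    def resolve_procs_cpu_bound(procs_cpu_bound: Optional[int]) -> int:
        # Equivalent to:
        # 1 if cpus is None else cpus if procs_cpu_bound is None else procs_cpu_bound
        cpus: Optional[int] = cpu_count()  # may be None when the count is undetermined
        if cpus is None:
            return 1              # no CPU information: run single-process
        if procs_cpu_bound is None:
            return cpus           # default: one worker per core
        return procs_cpu_bound    # explicit user setting wins

Note that when cpu_count() returns None the expression yields 1 even if the caller passed an explicit value, since Python parses it as 1 if cpus is None else (...).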

data_samples/fwf_nano.txt

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+1 Herfelingen27-12-18
+2   Lambusart14-06-18
+3Spormaggiore15-04-18
+4   Buizingen05-09-19
+5  San Rafael04-09-19
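
The sample encodes three fixed-width columns: a 1-character id, a 12-character right-aligned name and an 8-character date, which is why the tests below pass widths=[1, 12, 8]. A quick local sanity check (a sketch run against the file on disk rather than S3):

    import pandas as pd

    df = pd.read_fwf("data_samples/fwf_nano.txt",
                     widths=[1, 12, 8],
                     names=["id", "name", "date"])
    assert len(df.index) == 5  # same row count the tests assert after the S3 round-trip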

data_samples/fwf_nano.txt.zip

263 Bytes
Binary file not shown.

testing/test_awswrangler/test_pandas.py

Lines changed: 25 additions & 0 deletions
@@ -2492,3 +2492,28 @@ def test_to_csv_string(bucket, database):
                     ctas_approach=False)
     wr.s3.delete_objects(path=path)
     assert df.equals(df2)
+
+
+@pytest.mark.parametrize("sample, row_num", [
+    ("data_samples/fwf_nano.txt", 5),
+    ("data_samples/fwf_nano.txt.zip", 5)
+])
+def test_read_fwf(bucket, sample, row_num):
+    path = f"s3://{bucket}/{sample}"
+    wr.s3.delete_objects(path=f"s3://{bucket}/")
+    boto3.client("s3").upload_file(sample, bucket, sample)
+    dataframe = wr.pandas.read_fwf(path=path, widths=[1, 12, 8], names=["id", "name", "date"])
+    wr.s3.delete_objects(path=path)
+    assert len(dataframe.index) == row_num
+
+
+def test_read_fwf_prefix(bucket):
+    path = f"s3://{bucket}/data_samples/"
+    wr.s3.delete_objects(path=f"s3://{bucket}/")
+    boto3.client("s3").upload_file("data_samples/fwf_nano.txt", bucket, "data_samples/fwf_nano.txt")
+    boto3.client("s3").upload_file("data_samples/fwf_nano.txt.zip", bucket, "data_samples/fwf_nano.txt.zip")
+    sleep(10)
+    dataframe = wr.pandas.read_fwf_prefix(path_prefix=path, widths=[1, 12, 8], names=["id", "name", "date"])
+    wr.s3.delete_objects(path=path)
+    print(dataframe)
+    assert len(dataframe.index) == 10
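
test_read_fwf_prefix expects 10 rows because read_fwf_prefix lists both objects under the data_samples/ prefix, the plain text file and its zipped copy with 5 rows each, and concatenates the results. A local sketch of the same arithmetic (pandas infers the zip compression from the extension):

    import pandas as pd

    kwargs = dict(widths=[1, 12, 8], names=["id", "name", "date"])
    dfs = [pd.read_fwf("data_samples/fwf_nano.txt", **kwargs),
           pd.read_fwf("data_samples/fwf_nano.txt.zip", **kwargs)]
    df = pd.concat(objs=dfs, ignore_index=True, sort=False)  # same merge step as read_fwf_list
    assert len(df.index) == 10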
