@@ -664,12 +664,14 @@ def _read_sql_athena_regular(self,
664
664
dtype , parse_timestamps , parse_dates , converters = self ._get_query_dtype (
665
665
query_execution_id = query_execution_id )
666
666
path = f"{ s3_output } { query_execution_id } .csv"
667
+ logger .debug ("Start reading..." )
667
668
ret = self .read_csv (path = path ,
668
669
dtype = dtype ,
669
670
parse_dates = parse_timestamps ,
670
671
converters = converters ,
671
672
quoting = csv .QUOTE_ALL ,
672
673
max_result_size = max_result_size )
674
+ logger .debug ("Start type casting..." )
673
675
if max_result_size is None :
674
676
if len (ret .index ) > 0 :
675
677
for col in parse_dates :
@@ -1129,7 +1131,6 @@ def write_csv_dataframe(dataframe, path, preserve_index, compression, fs, extra_
1129
1131
elif serde == "LazySimpleSerDe" :
1130
1132
csv_extra_args ["quoting" ] = csv .QUOTE_NONE
1131
1133
csv_extra_args ["escapechar" ] = "\\ "
1132
- logger .debug (f"csv_extra_args: { csv_extra_args } " )
1133
1134
csv_buffer : bytes = bytes (
1134
1135
dataframe .to_csv (None , header = False , index = preserve_index , compression = compression , ** csv_extra_args ),
1135
1136
"utf-8" )
@@ -1360,19 +1361,19 @@ def read_parquet(self,
1360
1361
"""
1361
1362
procs_cpu_bound = procs_cpu_bound if procs_cpu_bound is not None else self ._session .procs_cpu_bound if self ._session .procs_cpu_bound is not None else 1
1362
1363
logger .debug (f"procs_cpu_bound: { procs_cpu_bound } " )
1363
- df : Optional [pd .DataFrame ] = None
1364
+ dfs : List [pd .DataFrame ] = []
1364
1365
session_primitives = self ._session .primitives
1365
1366
path = [path ] if type (path ) == str else path # type: ignore
1366
1367
bounders = calculate_bounders (len (path ), procs_cpu_bound )
1367
1368
logger .debug (f"len(bounders): { len (bounders )} " )
1368
1369
if len (bounders ) == 1 :
1369
- df = Pandas ._read_parquet_paths (session_primitives = session_primitives ,
1370
- path = path ,
1371
- columns = columns ,
1372
- filters = filters ,
1373
- procs_cpu_bound = procs_cpu_bound ,
1374
- wait_objects = wait_objects ,
1375
- wait_objects_timeout = wait_objects_timeout )
1370
+ dfs = Pandas ._read_parquet_paths (session_primitives = session_primitives ,
1371
+ path = path ,
1372
+ columns = columns ,
1373
+ filters = filters ,
1374
+ procs_cpu_bound = procs_cpu_bound ,
1375
+ wait_objects = wait_objects ,
1376
+ wait_objects_timeout = wait_objects_timeout )
1376
1377
else :
1377
1378
procs = []
1378
1379
receive_pipes = []
@@ -1398,15 +1399,16 @@ def read_parquet(self,
1398
1399
logger .debug (f"len(procs): { len (bounders )} " )
1399
1400
for i in range (len (procs )):
1400
1401
logger .debug (f"Waiting pipe number: { i } " )
1401
- df_received = receive_pipes [i ].recv ()
1402
- if df is None :
1403
- df = df_received
1404
- else :
1405
- df = pd .concat (objs = [df , df_received ], ignore_index = True )
1402
+ dfs_received : List [pd .DataFrame ] = receive_pipes [i ].recv ()
1403
+ dfs = dfs_received + dfs
1406
1404
logger .debug (f"Waiting proc number: { i } " )
1407
1405
procs [i ].join ()
1408
1406
logger .debug (f"Closing proc number: { i } " )
1409
1407
receive_pipes [i ].close ()
1408
+ if len (dfs ) == 1 :
1409
+ df : pd .DataFrame = dfs [0 ]
1410
+ else :
1411
+ df = pd .concat (objs = dfs , ignore_index = True )
1410
1412
return df
1411
1413
1412
1414
@staticmethod
@@ -1418,14 +1420,14 @@ def _read_parquet_paths_remote(send_pipe: mp.connection.Connection,
1418
1420
procs_cpu_bound : Optional [int ] = None ,
1419
1421
wait_objects : bool = False ,
1420
1422
wait_objects_timeout : Optional [float ] = 10.0 ):
1421
- df : pd .DataFrame = Pandas ._read_parquet_paths (session_primitives = session_primitives ,
1422
- path = path ,
1423
- columns = columns ,
1424
- filters = filters ,
1425
- procs_cpu_bound = procs_cpu_bound ,
1426
- wait_objects = wait_objects ,
1427
- wait_objects_timeout = wait_objects_timeout )
1428
- send_pipe .send (df )
1423
+ dfs : List [ pd .DataFrame ] = Pandas ._read_parquet_paths (session_primitives = session_primitives ,
1424
+ path = path ,
1425
+ columns = columns ,
1426
+ filters = filters ,
1427
+ procs_cpu_bound = procs_cpu_bound ,
1428
+ wait_objects = wait_objects ,
1429
+ wait_objects_timeout = wait_objects_timeout )
1430
+ send_pipe .send (dfs )
1429
1431
send_pipe .close ()
1430
1432
1431
1433
@staticmethod
@@ -1435,7 +1437,7 @@ def _read_parquet_paths(session_primitives: "SessionPrimitives",
1435
1437
filters : Optional [Union [List [Tuple [Any ]], List [List [Tuple [Any ]]]]] = None ,
1436
1438
procs_cpu_bound : Optional [int ] = None ,
1437
1439
wait_objects : bool = False ,
1438
- wait_objects_timeout : Optional [float ] = 10.0 ) -> pd .DataFrame :
1440
+ wait_objects_timeout : Optional [float ] = 10.0 ) -> List [ pd .DataFrame ] :
1439
1441
"""
1440
1442
Read parquet data from S3
1441
1443
@@ -1459,24 +1461,19 @@ def _read_parquet_paths(session_primitives: "SessionPrimitives",
1459
1461
procs_cpu_bound = procs_cpu_bound ,
1460
1462
wait_objects = wait_objects ,
1461
1463
wait_objects_timeout = wait_objects_timeout )
1464
+ return [df ]
1462
1465
else :
1463
- df = Pandas ._read_parquet_path (session_primitives = session_primitives ,
1464
- path = path [0 ],
1465
- columns = columns ,
1466
- filters = filters ,
1467
- procs_cpu_bound = procs_cpu_bound ,
1468
- wait_objects = wait_objects ,
1469
- wait_objects_timeout = wait_objects_timeout )
1470
- for p in path [1 :]:
1471
- df_aux = Pandas ._read_parquet_path (session_primitives = session_primitives ,
1472
- path = p ,
1473
- columns = columns ,
1474
- filters = filters ,
1475
- procs_cpu_bound = procs_cpu_bound ,
1476
- wait_objects = wait_objects ,
1477
- wait_objects_timeout = wait_objects_timeout )
1478
- df = pd .concat (objs = [df , df_aux ], ignore_index = True )
1479
- return df
1466
+ dfs : List [pd .DataFrame ] = []
1467
+ for p in path :
1468
+ df = Pandas ._read_parquet_path (session_primitives = session_primitives ,
1469
+ path = p ,
1470
+ columns = columns ,
1471
+ filters = filters ,
1472
+ procs_cpu_bound = procs_cpu_bound ,
1473
+ wait_objects = wait_objects ,
1474
+ wait_objects_timeout = wait_objects_timeout )
1475
+ dfs .append (df )
1476
+ return dfs
1480
1477
1481
1478
@staticmethod
1482
1479
def _read_parquet_path (session_primitives : "SessionPrimitives" ,
@@ -1851,17 +1848,17 @@ def read_csv_list(
1851
1848
procs .append (proc )
1852
1849
receive_pipes .append (receive_pipe )
1853
1850
utils .wait_process_release (processes = procs , target_number = procs_cpu_bound )
1851
+ dfs : List [pd .DataFrame ] = []
1854
1852
for i in range (len (procs )):
1855
1853
logger .debug (f"Waiting pipe number: { i } " )
1856
1854
df_received = receive_pipes [i ].recv ()
1857
- if df is None :
1858
- df = df_received
1859
- else :
1860
- df = pd .concat (objs = [df , df_received ], ignore_index = True )
1855
+ dfs .append (df_received )
1861
1856
logger .debug (f"Waiting proc number: { i } " )
1862
1857
procs [i ].join ()
1863
1858
logger .debug (f"Closing proc number: { i } " )
1864
1859
receive_pipes [i ].close ()
1860
+ logger .debug (f"Concatenating all { len (paths )} DataFrames..." )
1861
+ df = pd .concat (objs = dfs , ignore_index = True )
1865
1862
return df
1866
1863
1867
1864
def _read_csv_list_iterator (
0 commit comments