@@ -63,12 +63,14 @@ def delete_objects(self, path):
63
63
service_name = "s3" , config = self ._session .botocore_config )
64
64
procs = []
65
65
args = {"Bucket" : bucket , "MaxKeys" : 1000 , "Prefix" : path }
66
+ LOGGER .debug (f"Arguments: \n { args } " )
66
67
next_continuation_token = True
67
68
while next_continuation_token :
68
69
res = client .list_objects_v2 (** args )
69
70
if not res .get ("Contents" ):
70
71
break
71
72
keys = [{"Key" : x .get ("Key" )} for x in res .get ("Contents" )]
73
+ LOGGER .debug (f"Number of listed keys: { len (keys )} " )
72
74
next_continuation_token = res .get ("NextContinuationToken" )
73
75
if next_continuation_token :
74
76
args ["ContinuationToken" ] = next_continuation_token
@@ -79,15 +81,25 @@ def delete_objects(self, path):
79
81
proc .daemon = False
80
82
proc .start ()
81
83
procs .append (proc )
84
+ while len (procs ) >= self ._session .procs_io_bound :
85
+ LOGGER .debug (
86
+ f"len(procs) ({ len (procs )} ) >= self._session.procs_io_bound ({ self ._session .procs_io_bound } )"
87
+ )
88
+ procs [0 ].join ()
89
+ del procs [0 ]
90
+ LOGGER .debug (f"Processes deleted from list." )
82
91
else :
92
+ LOGGER .debug (f"Starting last delete call..." )
83
93
self .delete_objects_batch (self ._session .primitives , bucket ,
84
94
keys )
95
+ LOGGER .debug (f"Waiting final processes..." )
85
96
for proc in procs :
86
97
proc .join ()
87
98
88
99
def delete_listed_objects (self , objects_paths , procs_io_bound = None ):
89
100
if not procs_io_bound :
90
101
procs_io_bound = self ._session .procs_io_bound
102
+ LOGGER .debug (f"procs_io_bound: { procs_io_bound } " )
91
103
buckets = {}
92
104
for path in objects_paths :
93
105
path_cleaned = path .replace ("s3://" , "" )
@@ -98,8 +110,11 @@ def delete_listed_objects(self, objects_paths, procs_io_bound=None):
98
110
99
111
for bucket , batch in buckets .items ():
100
112
procs = []
113
+ LOGGER .debug (f"bucket: { bucket } " )
101
114
if procs_io_bound > 1 :
115
+ LOGGER .debug (f"len(batch): { len (batch )} " )
102
116
bounders = calculate_bounders (len (batch ), procs_io_bound )
117
+ LOGGER .debug (f"bounders: { bounders } " )
103
118
for bounder in bounders :
104
119
proc = mp .Process (
105
120
target = self .delete_objects_batch ,
@@ -118,7 +133,11 @@ def delete_listed_objects(self, objects_paths, procs_io_bound=None):
118
133
for proc in procs :
119
134
proc .join ()
120
135
121
- def delete_not_listed_objects (self , objects_paths ):
136
+ def delete_not_listed_objects (self , objects_paths , procs_io_bound = None ):
137
+ if not procs_io_bound :
138
+ procs_io_bound = self ._session .procs_io_bound
139
+ LOGGER .debug (f"procs_io_bound: { procs_io_bound } " )
140
+
122
141
partitions = {}
123
142
for object_path in objects_paths :
124
143
partition_path = f"{ object_path .rsplit ('/' , 1 )[0 ]} /"
@@ -129,20 +148,35 @@ def delete_not_listed_objects(self, objects_paths):
129
148
for partition_path , batch in partitions .items ():
130
149
proc = mp .Process (
131
150
target = self .delete_not_listed_batch ,
132
- args = (self ._session .primitives , partition_path , batch ),
151
+ args = (self ._session .primitives , partition_path , batch , 1 ),
133
152
)
134
153
proc .daemon = False
135
154
proc .start ()
136
155
procs .append (proc )
156
+ while len (procs ) >= procs_io_bound :
157
+ LOGGER .debug (
158
+ f"len(procs) ({ len (procs )} ) >= procs_io_bound ({ procs_io_bound } )"
159
+ )
160
+ procs [0 ].join ()
161
+ del procs [0 ]
162
+ LOGGER .debug (f"Processes deleted from list." )
163
+ LOGGER .debug (f"Waiting final processes..." )
137
164
for proc in procs :
138
165
proc .join ()
139
166
140
167
@staticmethod
def delete_not_listed_batch(session_primitives,
                            partition_path,
                            batch,
                            procs_io_bound=None):
    """Delete every object under ``partition_path`` that is not in ``batch``.

    Lists the objects under ``partition_path`` through the session's S3
    helper, keeps the ones that appear in ``batch`` and deletes the rest.

    :param session_primitives: Object exposing a ``session`` attribute; that
        session supplies ``procs_io_bound`` and the ``s3`` helper used here.
    :param partition_path: Path handed to ``session.s3.list_objects``.
    :param batch: Collection of object paths that must be kept.
    :param procs_io_bound: Optional value; when falsy it falls back to
        ``session.procs_io_bound``. It is only logged here (see note below).
    :return: None
    """
    session = session_primitives.session
    if not procs_io_bound:
        procs_io_bound = session.procs_io_bound
    LOGGER.debug(f"procs_io_bound: {procs_io_bound}")
    keys = session.s3.list_objects(path=partition_path)
    # Hash the keep-list once so each membership test is O(1) instead of a
    # linear scan of ``batch`` per listed key. Fall back to the original
    # container when its elements cannot be hashed.
    try:
        keep = frozenset(batch)
    except TypeError:
        keep = batch
    dead_keys = [key for key in keys if key not in keep]
    # NOTE(review): the resolved ``procs_io_bound`` is not forwarded; the
    # nested delete is pinned to 1. Presumably this avoids spawning further
    # processes from inside a worker process -- confirm before changing.
    session.s3.delete_listed_objects(objects_paths=dead_keys,
                                     procs_io_bound=1)
146
180
147
181
@staticmethod
148
182
def delete_objects_batch (session_primitives , bucket , batch ):
@@ -151,6 +185,7 @@ def delete_objects_batch(session_primitives, bucket, batch):
151
185
config = session .botocore_config )
152
186
num_requests = int (ceil ((float (len (batch )) / 1000.0 )))
153
187
bounders = calculate_bounders (len (batch ), num_requests )
188
+ LOGGER .debug (f"Bounders: { bounders } " )
154
189
for bounder in bounders :
155
190
client .delete_objects (
156
191
Bucket = bucket ,
@@ -193,25 +228,30 @@ def _get_objects_head_remote(send_pipe, session_primitives, objects_paths):
193
228
client = session .boto3_session .client (service_name = "s3" ,
194
229
config = session .botocore_config )
195
230
objects_sizes = {}
231
+ LOGGER .debug (f"len(objects_paths): { len (objects_paths )} " )
196
232
for object_path in objects_paths :
197
233
bucket , key = object_path .replace ("s3://" , "" ).split ("/" , 1 )
198
234
res = S3 ._head_object_with_retry (client = client ,
199
235
bucket = bucket ,
200
236
key = key )
201
237
size = res .get ("ContentLength" )
202
238
objects_sizes [object_path ] = size
239
+ LOGGER .debug (f"len(objects_sizes): { len (objects_sizes )} " )
203
240
send_pipe .send (objects_sizes )
204
241
send_pipe .close ()
205
242
206
243
def get_objects_sizes (self , objects_paths , procs_io_bound = None ):
207
244
if not procs_io_bound :
208
245
procs_io_bound = self ._session .procs_io_bound
246
+ LOGGER .debug (f"procs_io_bound: { procs_io_bound } " )
209
247
objects_sizes = {}
210
248
procs = []
211
249
receive_pipes = []
212
250
bounders = calculate_bounders (len (objects_paths ), procs_io_bound )
251
+ LOGGER .debug (f"len(bounders): { len (bounders )} " )
213
252
for bounder in bounders :
214
253
receive_pipe , send_pipe = mp .Pipe ()
254
+ LOGGER .debug (f"bounder: { bounder } " )
215
255
proc = mp .Process (
216
256
target = self ._get_objects_head_remote ,
217
257
args = (
@@ -224,8 +264,13 @@ def get_objects_sizes(self, objects_paths, procs_io_bound=None):
224
264
proc .start ()
225
265
procs .append (proc )
226
266
receive_pipes .append (receive_pipe )
267
+ LOGGER .debug (f"len(procs): { len (bounders )} " )
227
268
for i in range (len (procs )):
228
- objects_sizes .update (receive_pipes [i ].recv ())
269
+ LOGGER .debug (f"Waiting pipe number: { i } " )
270
+ receved = receive_pipes [i ].recv ()
271
+ objects_sizes .update (receved )
272
+ LOGGER .debug (f"Waiting proc number: { i } " )
229
273
procs [i ].join ()
274
+ LOGGER .debug (f"Closing proc number: { i } " )
230
275
receive_pipes [i ].close ()
231
276
return objects_sizes
0 commit comments