|
26 | 26 | from contextlib import contextmanager |
27 | 27 |
|
28 | 28 | from . import get_namespace |
29 | | -from .metadata_provider import MetaDatum |
30 | 29 | from .metaflow_config import FEAT_ALWAYS_UPLOAD_CODE_PACKAGE, MAX_ATTEMPTS, UI_URL |
31 | 30 | from .exception import ( |
32 | 31 | MetaflowException, |
|
44 | 43 | from .unbounded_foreach import ( |
45 | 44 | CONTROL_TASK_TAG, |
46 | 45 | UBF_CONTROL, |
47 | | - UBF_TASK, |
48 | 46 | ) |
49 | 47 |
|
50 | 48 | from .user_configs.config_options import ConfigInput |
@@ -103,7 +101,7 @@ def __init__( |
103 | 101 | clone_only=False, |
104 | 102 | reentrant=False, |
105 | 103 | steps_to_rerun=None, |
106 | | - step_only=False, |
| 104 | + until_steps=None, |
107 | 105 | max_workers=MAX_WORKERS, |
108 | 106 | max_num_splits=MAX_NUM_SPLITS, |
109 | 107 | max_log_size=MAX_LOG_SIZE, |
@@ -148,49 +146,31 @@ def __init__( |
148 | 146 | # If steps_to_rerun is specified, we will not clone them in resume mode. |
149 | 147 | self._steps_to_rerun = steps_to_rerun or set() |
150 | 148 | self._steps_can_clone = set() |
151 | | - self._steps_ran = set() |
152 | | - self._step_only = step_only |
| 149 | + self._steps_no_run = until_steps or set() |
| 150 | + |
153 | 151 | all_steps = set() |
154 | | - cannot_clone_steps = set(self._steps_to_rerun) |
155 | | - # sorted_nodes are in topological order already, so we only need to |
156 | | - # iterate through the nodes once to get a stable set of rerun steps. |
157 | | - # A few modes: |
158 | | - # - no steps_to_rerun: |
159 | | - # - not clone_only and not step_only: clone all previously executed steps and |
160 | | - # continue execution. |
161 | | - # - clone_only and not step_only: clone all steps that have previously executed |
162 | | - # and stop |
163 | | - # - not clone_only and step_only: NOT possible (requires a steps_to_rerun) |
164 | | - # - clone_only and step_only: NOT possible (requires a steps_to_rerun) |
165 | | - # => in all these cases, _steps_to_rerun is empty and so _steps_can_clone is |
166 | | - # all_steps |
167 | | - # - steps_to_rerun: |
168 | | - # - not clone_only and not step_only: clone all previously executed steps *except* |
169 | | - # any of the steps in steps_to_rerun and the subsequent steps. Continue execution. |
170 | | - # => _steps_to_rerun contains the steps to rerun and all descendants. _steps_can_clone |
171 | | - # contains all other steps |
172 | | - # - clone_only and not step_only: clone all steps that have previously executed |
173 | | - # up to (but not including) any of the steps in steps_to_rerun and |
174 | | - # subsequent steps. |
175 | | - # => same as above but steps_to_rerun is not used to run anything |
176 | | - # - not clone_only and step_only: clone all steps that have previously executed |
177 | | - # up to (but not including) any of the steps in steps_to_rerun and |
178 | | - # subsequent steps. Execute *only* the steps in steps_to_rerun if possible |
179 | | - # and stop. |
180 | | - # - clone_only and step_only: NOT possible (if step_only is specified, we turn |
181 | | - # off clone_only -- clone_only implies no further execution since task |
182 | | - # objects will not be generated). |
183 | | - # => _steps_to_rerun contains *only* the initially passed steps to run and |
184 | | - # _steps_can_clone contains the same as in the other cases. |
| 152 | + # If clone_only is specified, we should have no until_steps and no steps_to_rerun |
| 153 | + # so the computation below yields reruning all the steps that we previously |
| 154 | + # executed. |
| 155 | + # In the other cases, we will allow the cloning of steps up to but not |
| 156 | + # inclusive of anything in steps_to_rerun and at the end, steps_to_rerun |
| 157 | + # will contain all steps up to but not inclusive of anything in _steps_no_run. |
185 | 158 | for step_name in self._graph.sorted_nodes: |
186 | 159 | all_steps.add(step_name) |
187 | | - if step_name in cannot_clone_steps: |
188 | | - out_funcs = self._graph[step_name].out_funcs or [] |
| 160 | + out_funcs = self._graph[step_name].out_funcs or [] |
| 161 | + if step_name in self._steps_no_run: |
| 162 | + for next_step in out_funcs: |
| 163 | + self._steps_no_run.add(next_step) |
| 164 | + elif step_name in self._steps_to_rerun: |
189 | 165 | for next_step in out_funcs: |
190 | | - cannot_clone_steps.add(next_step) |
191 | | - self._steps_can_clone = all_steps - cannot_clone_steps |
192 | | - if not self._step_only: |
193 | | - self._steps_to_rerun = cannot_clone_steps |
| 166 | + # We may add things that are in steps_no_run but |
| 167 | + # we will remove them later. |
| 168 | + self._steps_to_rerun.add(next_step) |
| 169 | + self._steps_to_rerun = self._steps_to_rerun - self._steps_no_run |
| 170 | + self._steps_can_clone = all_steps - self._steps_to_rerun - self._steps_no_run |
| 171 | + print(f"steps_to_rerun: {self._steps_to_rerun}") |
| 172 | + print(f"steps_no_run: {self._steps_no_run}") |
| 173 | + print(f"steps_can_clone: {self._steps_can_clone}") |
194 | 174 |
|
195 | 175 | self._origin_ds_set = None |
196 | 176 | if clone_run_id: |
@@ -715,17 +695,19 @@ def execute(self): |
715 | 695 | system_msg=True, |
716 | 696 | ) |
717 | 697 | self._params_task.mark_resume_done() |
718 | | - elif self._step_only: |
719 | | - # Check that we ran all the steps in self._steps_to_rerun |
720 | | - steps_missing = self._steps_to_rerun - self._steps_ran |
721 | | - if steps_missing: |
722 | | - raise MetaflowInternalError( |
723 | | - "The following steps were not executed: {0}".format( |
724 | | - ", ".join(steps_missing) |
725 | | - ) |
726 | | - ) |
| 698 | + elif self._steps_no_run: |
| 699 | + # Ran a subset of the graph |
| 700 | + count_cloned = -1 # Account for _parameters task |
| 701 | + count_reexec = 0 |
| 702 | + for t in self._is_cloned.values(): |
| 703 | + if t: |
| 704 | + count_cloned += 1 |
| 705 | + else: |
| 706 | + count_reexec += 1 |
| 707 | + |
727 | 708 | self._logger( |
728 | | - "Step-only resume complete -- all specified steps were executed!", |
| 709 | + f"Partial resume complete -- cloned {count_cloned} step(s) and " |
| 710 | + f"executed {count_reexec} step(s)", |
729 | 711 | system_msg=True, |
730 | 712 | ) |
731 | 713 | else: |
@@ -1125,7 +1107,6 @@ def _queue_tasks(self, finished_tasks): |
1125 | 1107 | # finished tasks include only successful tasks |
1126 | 1108 | for task in finished_tasks: |
1127 | 1109 | step_name, _, _ = task.finished_id |
1128 | | - self._steps_ran.add(step_name) |
1129 | 1110 | self._finished[task.finished_id] = task.path |
1130 | 1111 | self._is_cloned[task.path] = task.is_cloned |
1131 | 1112 |
|
@@ -1190,11 +1171,11 @@ def _queue_tasks(self, finished_tasks): |
1190 | 1171 | ) |
1191 | 1172 | ) |
1192 | 1173 |
|
1193 | | - if self._step_only: |
| 1174 | + if self._steps_no_run: |
1194 | 1175 | # We need to filter next_steps to only include steps that are in |
1195 | 1176 | # self._steps_to_rerun |
1196 | 1177 | next_steps = [ |
1197 | | - step for step in next_steps if step in self._steps_to_rerun |
| 1178 | + step for step in next_steps if step not in self._steps_no_run |
1198 | 1179 | ] |
1199 | 1180 | if not next_steps: |
1200 | 1181 | # No steps to execute, so we can stop |
|
0 commit comments