Skip to content

Commit 78879e2

Browse files
authored
fix: Widen system test timeout, handle tearing down failed training pipelines (#791)
* Widen system test timeout to 5 hours
* Catch training pipeline failures in teardown
1 parent d8da2e3 commit 78879e2

File tree

2 files changed

+24
-14
lines changed

2 files changed

+24
-14
lines changed

Diff for: .kokoro/continuous/system.cfg

+3
Original file line number | Diff line number | Diff line change
@@ -10,3 +10,6 @@ env_vars: {
1010
key: "PYTEST_ADDOPTS"
1111
value: "-n=auto --dist=loadscope"
1212
}
13+
14+
# Kokoro VM timeout of 5 hours for system tests
15+
timeout_mins: 300

Diff for: samples/snippets/conftest.py

+21-14
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,8 @@
1515
import os
1616
from uuid import uuid4
1717

18+
from google.api_core import exceptions
19+
1820
from google.cloud import aiplatform, aiplatform_v1beta1
1921
from google.cloud import bigquery
2022
from google.cloud import storage
@@ -175,22 +177,27 @@ def teardown_hyperparameter_tuning_job(shared_state, job_client):
175177
def teardown_training_pipeline(shared_state, pipeline_client):
176178
yield
177179

178-
pipeline_client.cancel_training_pipeline(
179-
name=shared_state["training_pipeline_name"]
180-
)
180+
try:
181+
pipeline_client.cancel_training_pipeline(
182+
name=shared_state["training_pipeline_name"]
183+
)
181184

182-
# Waiting for training pipeline to be in CANCELLED state
183-
timeout = shared_state["cancel_batch_prediction_job_timeout"]
184-
helpers.wait_for_job_state(
185-
get_job_method=pipeline_client.get_training_pipeline,
186-
name=shared_state["training_pipeline_name"],
187-
timeout=timeout,
188-
)
185+
# Waiting for training pipeline to be in CANCELLED state
186+
timeout = shared_state["cancel_batch_prediction_job_timeout"]
187+
helpers.wait_for_job_state(
188+
get_job_method=pipeline_client.get_training_pipeline,
189+
name=shared_state["training_pipeline_name"],
190+
timeout=timeout,
191+
)
189192

190-
# Delete the training pipeline
191-
pipeline_client.delete_training_pipeline(
192-
name=shared_state["training_pipeline_name"]
193-
)
193+
except exceptions.FailedPrecondition:
194+
pass # If pipeline failed, ignore and skip directly to deletion
195+
196+
finally:
197+
# Delete the training pipeline
198+
pipeline_client.delete_training_pipeline(
199+
name=shared_state["training_pipeline_name"]
200+
)
194201

195202

196203
@pytest.fixture()

0 commit comments

Comments
 (0)