-
Notifications
You must be signed in to change notification settings - Fork 62
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Make datachain queries atomic when exception occurs (#494)
* Make datachain queries atomic when exception occurs. With this change, whenever any error or exception is raised while running the script, all dataset versions and datasets created during the script are reverted. Studio PR: iterative/studio#10740 Studio Issue: iterative/studio#9875 * Use uuid to make it work with postgres * Check only current dv are created
- Loading branch information
1 parent
644dc9a
commit 414872b
Showing
5 changed files
with
152 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# Configure logging for the script (INFO level).
import logging

from pydantic import BaseModel

from datachain.lib.dc import C, DataChain

logging.basicConfig(level=logging.INFO)


class Embedding(BaseModel):
    """Single-value embedding feature attached to each file."""

    value: float


ds_name = "feature_class_error"

# Build a small chain: five cat images, each mapped to a dummy embedding.
# NOTE: the lambda's parameter is named `file` on purpose — DataChain
# appears to bind mapper arguments to columns by name.
chain = DataChain.from_storage("gs://dvcx-datalakes/dogs-and-cats/")
chain = chain.filter(C("file.path").glob("*cat*.jpg")).limit(5)
chain = chain.map(emd=lambda file: Embedding(value=512), output=Embedding)

chain.select("file.path", "emd.value").show(limit=5, flatten=True)
chain.save(ds_name)

# Fail on purpose AFTER saving, so the caller can verify that the saved
# dataset is rolled back when the script errors out.
raise Exception("This is a test exception")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
import os | ||
import subprocess | ||
import sys | ||
|
||
import pytest | ||
import sqlalchemy as sa | ||
|
||
from datachain.sql.types import Float32 | ||
|
||
# Directory containing this test module; used to locate helper scripts.
tests_dir = os.path.dirname(os.path.abspath(__file__))

# Interpreter used to launch child scripts; fall back to "python3" when
# sys.executable is empty (e.g. some embedded interpreters).
python_exc = sys.executable if sys.executable else "python3"

# Upper bound (seconds) for each end-to-end subprocess step.
E2E_STEP_TIMEOUT_SEC = 90
|
||
|
||
@pytest.mark.e2e
@pytest.mark.xdist_group(name="tmpfile")
def test_atomicity_feature_file(tmp_dir, catalog_tmpfile):
    """Verify query atomicity: a script that saves a dataset and then raises
    must have its save rolled back, while pre-existing datasets survive.

    The script runs in a subprocess wired to the temporary catalog via the
    DATACHAIN__* environment variables, so its failure cannot affect this
    test process.
    """
    command = (
        python_exc,
        os.path.join(tests_dir, "scripts", "feature_class_exception.py"),
    )
    if sys.platform == "win32":
        # Windows has a different mechanism of creating a process group.
        # NOTE(review): the original mentioned STATUS_CONTROL_C_EXIT
        # (0xC000013A) here, but no Ctrl-C is sent in this test — that
        # comment appears to be leftover from a copied interrupt test.
        popen_args = {"creationflags": subprocess.CREATE_NEW_PROCESS_GROUP}
    else:
        popen_args = {"start_new_session": True}

    # A dataset that exists before the script runs; the rollback must
    # leave it untouched.
    existing_dataset = catalog_tmpfile.create_dataset(
        "existing_dataset",
        query_script="script",
        columns=[sa.Column("similarity", Float32)],
        create_rows=True,
    )

    process = subprocess.Popen(  # noqa: S603
        command,
        shell=False,
        encoding="utf-8",
        env={
            **os.environ,
            # Point the child process at the same temporary catalog backends.
            "DATACHAIN__ID_GENERATOR": catalog_tmpfile.id_generator.serialize(),
            "DATACHAIN__METASTORE": catalog_tmpfile.metastore.serialize(),
            "DATACHAIN__WAREHOUSE": catalog_tmpfile.warehouse.serialize(),
        },
        **popen_args,
    )

    try:
        process.communicate(timeout=E2E_STEP_TIMEOUT_SEC)
    except subprocess.TimeoutExpired:
        # Per the subprocess docs: on timeout, kill the child and finish
        # communication, otherwise the process leaks and pipes may block.
        process.kill()
        process.communicate()
        raise

    # The script ends with `raise Exception(...)`, so the interpreter
    # exits with status 1.
    assert process.returncode == 1

    # No datasets should be created in the catalog, but old should not be removed.
    dataset_versions = list(catalog_tmpfile.list_datasets_versions())
    assert len(dataset_versions) == 1
    assert dataset_versions[0][0].name == existing_dataset.name