|
8 | 8 | from spatialdata import read_zarr, SpatialData |
9 | 9 |
|
10 | 10 |
|
def create_xenium_filtered_points():
    """Create a small, points-only SpatialData store from the Xenium rep1 dataset.

    Downloads and extracts the dataset on first run, subsamples the
    "transcripts" points element to every 200th row (per Dask partition),
    and writes the result to a new Zarr store.

    Side effects:
        - Downloads/extracts the dataset into ``data/`` if not already present.
        - Deletes the AnnData "raw" element from the extracted table
          (workaround; see https://github.com/giovp/spatialdata-sandbox/issues/55).
        - Writes ``xenium_rep1_io.points_only.spatialdata.zarr`` in the
          current working directory, overwriting any existing store.
    """
    # 1. Download and extract the Xenium dataset if not already present.
    data_dir = "data"
    zip_filepath = join(data_dir, "xenium_rep1_io.spatialdata.zarr.zip")
    spatialdata_filepath = join(data_dir, "xenium_rep1_io.spatialdata.zarr")

    if not isdir(spatialdata_filepath):
        if not isfile(zip_filepath):
            os.makedirs(data_dir, exist_ok=True)
            # Pinned to the re-export of the dataset for spatialdata 0.7.1.
            zip_url = 'https://s3.embl.de/spatialdata/spatialdata-sandbox/xenium_rep1_io_spatialdata_0.7.1.zip'
            urlretrieve(zip_url, zip_filepath)
        with zipfile.ZipFile(zip_filepath, "r") as zip_ref:
            zip_ref.extractall(data_dir)
        # The archive extracts to a generic "data.zarr"; rename it to a
        # dataset-specific path so the presence check above works next run.
        os.rename(join(data_dir, "data.zarr"), spatialdata_filepath)

        # This Xenium dataset has an AnnData "raw" element.
        # Reference: https://github.com/giovp/spatialdata-sandbox/issues/55
        raw_dir = join(spatialdata_filepath, "tables", "table", "raw")
        if isdir(raw_dir):
            shutil.rmtree(raw_dir)

    sdata = read_zarr(spatialdata_filepath)

    ddf = sdata.points["transcripts"]

    # 2. Define a function to take every 200th row from a partition.
    def select_every_200th(partition):
        # Each 'partition' is a pandas DataFrame; .iloc[::200] keeps every
        # 200th row. NOTE: the stride restarts in each partition, so the
        # result is approximately (not exactly) a global 1-in-200 sample.
        return partition.iloc[::200]

    # 3. Apply this function to every partition in the Dask DataFrame.
    result = ddf.map_partitions(select_every_200th)

    # 4. Keep only the columns needed downstream.
    filtered_ddf = result[["x", "y", "z", "feature_name", "cell_id"]]

    small_sdata = SpatialData(points={"transcripts": filtered_ddf})

    small_sdata.write("xenium_rep1_io.points_only.spatialdata.zarr", overwrite=True)
0 commit comments