Skip to content

Commit 9422ebb

Browse files
authored
Make read_many return path as Char (#13475)
Fixes #13069
1 parent 5f333b7 commit 9422ebb

File tree

5 files changed

+110
-76
lines changed

5 files changed

+110
-76
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@
7171
- [`Meta.meta` recognizes functions as `Meta.Function`][13443]
7272
- [`Meta.Unresolved_Symbol` renamed to `Meta.Unresolved`][13443]
7373
- [Rename Google_Api library to Google][13436]
74+
- [Data.read_many now returns the read path as a char field][13475]
7475

7576
[12726]: https://github.com/enso-org/enso/pull/12726
7677
[12950]: https://github.com/enso-org/enso/pull/12950
@@ -84,6 +85,7 @@
8485
[13415]: https://github.com/enso-org/enso/pull/13415
8586
[13443]: https://github.com/enso-org/enso/pull/13443
8687
[13436]: https://github.com/enso-org/enso/pull/13436
88+
[13475]: https://github.com/enso-org/enso/pull/13475
8789

8890
#### Enso Language & Runtime
8991

distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Read_Many_Helpers.enso

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,7 @@ import project.Value_Type.Value_Type
1717
from project.Internal.Table_Helpers import duplicate_rows
1818

1919
find_files_list_in_table (that : Table) -> Many_Files_List =
20-
found_column = if that.column_count == 1 then that.at 0 else
21-
path_columns = that.select_columns "path" case_sensitivity=..Insensitive on_problems=..Report_Error
22-
not_found = path_columns.is_error || (path_columns.column_count == 0)
23-
if not_found then Error.throw (Illegal_Argument.Error "To use a Table as file list, it must be a single column or contain a `path` column (case insensitive).") else
24-
if path_columns.column_count > 1 then Error.throw (Illegal_Argument.Error "Multiple 'paths' column candidates found: "+path_columns.column_names.to_display_text+".") else
25-
path_columns.at 0
20+
found_column = _find_path_column_in_table that
2621
ensure_column_type_valid_to_be_files_list found_column <|
2722
Many_Files_List.Value that found_column.to_vector
2823

@@ -40,15 +35,18 @@ ensure_column_type_valid_to_be_files_list (column : Column) ~action =
4035

4136
make_return (return_shape : Return_As_Table) (input : Many_Files_List) (objects : Vector (Any | Failed_To_Load)) (on_problems : Problem_Behavior) -> Table =
4237
base_table = _input_as_table input
38+
path_column = _find_path_column_in_table base_table . map .to_text
39+
typed_base_table = base_table.set path_column
40+
4341
case return_shape of
4442
Return_As_Table.With_New_Column ->
4543
replaced = replace_with_nothing_and_propagate objects on_problems
46-
_add_objects_column base_table replaced
44+
_add_objects_column typed_base_table replaced
4745
Return_As_Table.As_Merged_Table columns_to_keep match_columns ->
4846
tables = input.paths_to_load.zip objects path-> object->
4947
strategy = Read_Many_As_Merged_Table_Strategy.from object
5048
strategy.into_table path columns_to_keep match_columns on_problems
51-
_merge_input_and_tables base_table tables columns_to_keep match_columns on_problems
49+
_merge_input_and_tables typed_base_table tables columns_to_keep match_columns on_problems
5250

5351
_input_as_table (input : Many_Files_List) -> Table =
5452
case input.original_value of
@@ -113,6 +111,14 @@ _add_objects_column (base_table : Table) (objects : Vector Any) -> Table =
113111
objects_column_name = unique_naming.make_unique "Value"
114112
base_table.set (Column.from_vector objects_column_name objects) as=objects_column_name set_mode=..Add
115113

114+
_find_path_column_in_table (that : Table) -> Column =
115+
if that.column_count == 1 then that.at 0 else
116+
path_columns = that.select_columns "path" case_sensitivity=..Insensitive on_problems=..Report_Error
117+
not_found = path_columns.is_error || (path_columns.column_count == 0)
118+
if not_found then Error.throw (Illegal_Argument.Error "To use a Table as file list, it must be a single column or contain a `path` column (case insensitive).") else
119+
if path_columns.column_count > 1 then Error.throw (Illegal_Argument.Error "Multiple 'paths' column candidates found: "+path_columns.column_names.to_display_text+".") else
120+
path_columns.at 0
121+
116122
## Workaround for bug https://github.com/enso-org/enso/issues/11570
117123
TODO: Remove workaround once #11570 is closed.
118124
private _inherit_warnings_from_vector vector:Vector ~action =

test/Table_Tests/src/IO/Excel_Spec.enso

Lines changed: 36 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1081,15 +1081,15 @@ add_specs suite_builder =
10811081
r2.column_names . should_equal ["Path", "Sheet Name", "A", "B", "C"]
10821082

10831083
# We transform the Path to just file name for easier testing
1084-
rows = (r2.set (r2.at "Path" . map .name) "Path").rows.map .to_vector
1085-
rows.at 0 . should_equal ["1.tsv", Nothing, 1, 3, Nothing]
1086-
rows.at 1 . should_equal ["1.tsv", Nothing, 2, 4, Nothing]
1087-
rows.at 2 . should_equal ["2.xlsx", "EnsoSheet", 10, 30, Nothing]
1088-
rows.at 3 . should_equal ["2.xlsx", "EnsoSheet", 20, 40, Nothing]
1089-
rows.at 4 . should_equal ["3.xlsx", "nr 1", 100, 200, 300]
1090-
rows.at 5 . should_equal ["3.xlsx", "nr 2", 400, Nothing, Nothing]
1091-
rows.at 6 . should_equal ["3.xlsx", "nr 2", 500, Nothing, Nothing]
1092-
rows.at 7 . should_equal ["3.xlsx", "nr 2", 600, Nothing, Nothing]
1084+
rows = r2.rows.map .to_vector
1085+
rows.at 0 . should_equal [(base_dir / "1.tsv") . path, Nothing, 1, 3, Nothing]
1086+
rows.at 1 . should_equal [(base_dir / "1.tsv") . path, Nothing, 2, 4, Nothing]
1087+
rows.at 2 . should_equal [(base_dir / "2.xlsx") . path, "EnsoSheet", 10, 30, Nothing]
1088+
rows.at 3 . should_equal [(base_dir / "2.xlsx") . path, "EnsoSheet", 20, 40, Nothing]
1089+
rows.at 4 . should_equal [(base_dir / "3.xlsx") . path, "nr 1", 100, 200, 300]
1090+
rows.at 5 . should_equal [(base_dir / "3.xlsx") . path, "nr 2", 400, Nothing, Nothing]
1091+
rows.at 6 . should_equal [(base_dir / "3.xlsx") . path, "nr 2", 500, Nothing, Nothing]
1092+
rows.at 7 . should_equal [(base_dir / "3.xlsx") . path, "nr 2", 600, Nothing, Nothing]
10931093

10941094
# Test loading only Excel files and alternate matching mode to weed out edge cases
10951095
r3 = Data.read_many (Data.list base_dir name_filter="*.xlsx" . sort on=(.name)) return=..As_Merged_Table
@@ -1115,56 +1115,56 @@ add_specs suite_builder =
11151115
r1.should_be_a Table
11161116
within_table r1 <|
11171117
# We transform the Path to just file name for easier testing
1118-
rows = (r1.set (r1.at "Path" . map .name) "Path").rows.map .to_vector
1118+
rows = r1.rows.map .to_vector
11191119

11201120
# Each Sheet Name column comes out as separate: 1 - input, 2 - metadata, 3 - data
11211121
# The order of columns is as they appear in the input, and they are matched by name
11221122
r1.column_names . should_equal [ "Path", "Sheet Name", "Sheet Name 1", "Z", "Sheet Name 2", "X", "Y"]
1123-
rows.at 0 . should_equal [ "1.tsv", "input 1", Nothing, 1, "data column", Nothing, Nothing]
1124-
rows.at 1 . should_equal [ "1.tsv", "input 1", Nothing, 2, "data column", Nothing, Nothing]
1125-
rows.at 2 . should_equal ["2.xlsx", "input 2", "EnsoSheet", 10, Nothing, 20, Nothing]
1126-
rows.at 3 . should_equal ["3.xlsx", "input 3", "nr 1", 300, Nothing, 100, 200]
1127-
rows.at 4 . should_equal ["3.xlsx", "input 3", "nr 2", Nothing, "400", Nothing, Nothing]
1128-
rows.at 5 . should_equal ["3.xlsx", "input 3", "nr 2", Nothing, "500", Nothing, Nothing]
1129-
rows.at 6 . should_equal ["3.xlsx", "input 3", "nr 2", Nothing, "600", Nothing, Nothing]
1123+
rows.at 0 . should_equal [(base_dir / "1.tsv").path, "input 1", Nothing, 1, "data column", Nothing, Nothing]
1124+
rows.at 1 . should_equal [(base_dir / "1.tsv").path, "input 1", Nothing, 2, "data column", Nothing, Nothing]
1125+
rows.at 2 . should_equal [(base_dir / "2.xlsx").path, "input 2", "EnsoSheet", 10, Nothing, 20, Nothing]
1126+
rows.at 3 . should_equal [(base_dir / "3.xlsx").path, "input 3", "nr 1", 300, Nothing, 100, 200]
1127+
rows.at 4 . should_equal [(base_dir / "3.xlsx").path, "input 3", "nr 2", Nothing, "400", Nothing, Nothing]
1128+
rows.at 5 . should_equal [(base_dir / "3.xlsx").path, "input 3", "nr 2", Nothing, "500", Nothing, Nothing]
1129+
rows.at 6 . should_equal [(base_dir / "3.xlsx").path, "input 3", "nr 2", Nothing, "600", Nothing, Nothing]
11301130

11311131
Problems.expect_warning Duplicate_Output_Column_Names r1
11321132
Problems.expect_warning No_Common_Type r1
11331133

11341134
r2 = Data.read_many input return=(..As_Merged_Table match=..By_Position)
11351135
r2.should_be_a Table
11361136
within_table r2 <|
1137-
rows = (r2.set (r2.at "Path" . map .name) "Path").rows.map .to_vector
1137+
rows = r2.rows.map .to_vector
11381138

11391139
# Two Sheet Name columns come out as separate: 1 - input, 2 - metadata, the third one (data) gets renamed due to positional matching
11401140
# The column names come from the first table that had all the columns - in this case, first sheet of 3.xlsx
11411141
r2.column_names . should_equal [ "Path", "Sheet Name", "Sheet Name 1", "X", "Y", "Z"]
1142-
rows.at 0 . should_equal [ "1.tsv", "input 1", Nothing, 1, "data column", Nothing]
1143-
rows.at 1 . should_equal [ "1.tsv", "input 1", Nothing, 2, "data column", Nothing]
1144-
rows.at 2 . should_equal ["2.xlsx", "input 2", "EnsoSheet", 10, "20", Nothing]
1145-
rows.at 3 . should_equal ["3.xlsx", "input 3", "nr 1", 100, "200", 300]
1146-
rows.at 4 . should_equal ["3.xlsx", "input 3", "nr 2", 400, Nothing, Nothing]
1147-
rows.at 5 . should_equal ["3.xlsx", "input 3", "nr 2", 500, Nothing, Nothing]
1148-
rows.at 6 . should_equal ["3.xlsx", "input 3", "nr 2", 600, Nothing, Nothing]
1142+
rows.at 0 . should_equal [(base_dir / "1.tsv").path, "input 1", Nothing, 1, "data column", Nothing]
1143+
rows.at 1 . should_equal [(base_dir / "1.tsv").path, "input 1", Nothing, 2, "data column", Nothing]
1144+
rows.at 2 . should_equal [(base_dir / "2.xlsx").path, "input 2", "EnsoSheet", 10, "20", Nothing]
1145+
rows.at 3 . should_equal [(base_dir / "3.xlsx").path, "input 3", "nr 1", 100, "200", 300]
1146+
rows.at 4 . should_equal [(base_dir / "3.xlsx").path, "input 3", "nr 2", 400, Nothing, Nothing]
1147+
rows.at 5 . should_equal [(base_dir / "3.xlsx").path, "input 3", "nr 2", 500, Nothing, Nothing]
1148+
rows.at 6 . should_equal [(base_dir / "3.xlsx").path, "input 3", "nr 2", 600, Nothing, Nothing]
11491149

11501150
Problems.expect_warning Duplicate_Output_Column_Names r2
11511151
Problems.expect_warning No_Common_Type r2
11521152

11531153
r3 = Data.read_many input return=(..As_Merged_Table columns_to_keep=..In_All match=..By_Position)
11541154
r3.should_be_a Table
11551155
within_table r3 <|
1156-
rows = (r3.set (r3.at "Path" . map .name) "Path").rows.map .to_vector
1156+
rows = r3.rows.map .to_vector
11571157

11581158
# Same as with `r2`, but now we keep only columns that are present in all tables, then the column names come from the first table (so we get column Z).
11591159
# But the `Sheet Name` metadata column is still kept, as its matching is independent of data.
11601160
r3.column_names . should_equal [ "Path", "Sheet Name", "Sheet Name 1", "Z"]
1161-
rows.at 0 . should_equal [ "1.tsv", "input 1", Nothing, 1]
1162-
rows.at 1 . should_equal [ "1.tsv", "input 1", Nothing, 2]
1163-
rows.at 2 . should_equal ["2.xlsx", "input 2", "EnsoSheet", 10]
1164-
rows.at 3 . should_equal ["3.xlsx", "input 3", "nr 1", 100]
1165-
rows.at 4 . should_equal ["3.xlsx", "input 3", "nr 2", 400]
1166-
rows.at 5 . should_equal ["3.xlsx", "input 3", "nr 2", 500]
1167-
rows.at 6 . should_equal ["3.xlsx", "input 3", "nr 2", 600]
1161+
rows.at 0 . should_equal [(base_dir / "1.tsv").path, "input 1", Nothing, 1]
1162+
rows.at 1 . should_equal [(base_dir / "1.tsv").path, "input 1", Nothing, 2]
1163+
rows.at 2 . should_equal [(base_dir / "2.xlsx").path, "input 2", "EnsoSheet", 10]
1164+
rows.at 3 . should_equal [(base_dir / "3.xlsx").path, "input 3", "nr 1", 100]
1165+
rows.at 4 . should_equal [(base_dir / "3.xlsx").path, "input 3", "nr 2", 400]
1166+
rows.at 5 . should_equal [(base_dir / "3.xlsx").path, "input 3", "nr 2", 500]
1167+
rows.at 6 . should_equal [(base_dir / "3.xlsx").path, "input 3", "nr 2", 600]
11681168

11691169
Problems.expect_warning Duplicate_Output_Column_Names r3
11701170
Problems.expect_warning Column_Count_Mismatch r3
@@ -1180,8 +1180,8 @@ add_specs suite_builder =
11801180
r.row_count . should_equal 2+25
11811181
r.column_names . should_equal ["Path", "Sheet Name", "A", "B", "Name", "Quantity", "Price", "C", "D", "E", "Student Name", "Enrolment Date", "Item", "Price 1"]
11821182
# First two rows come from TSV, the rest from Excel sheets
1183-
r.at "Path" . to_vector . map .name . should_equal <|
1184-
(Vector.fill 2 tsv_file.name) + (Vector.fill 25 xls_file.name)
1183+
r.at "Path" . to_vector . should_equal <|
1184+
(Vector.fill 2 tsv_file.path) + (Vector.fill 25 xls_file.path)
11851185
r.at "Sheet Name" . to_vector . should_equal <|
11861186
(Vector.fill 2 Nothing) + (Vector.fill 6 "Simple") + (Vector.fill 7 "Strange Dimensions") + (Vector.fill 6 "Dates") + (Vector.fill 6 "Duplicate Columns")
11871187

@@ -1203,7 +1203,7 @@ add_specs suite_builder =
12031203
# But when not expanding rows, the workbook with all-empty sheets is normally loaded into a cell
12041204
r3 = Data.read_many [tsv_file, empty_xls_file] return=..With_New_Column
12051205
r3.should_be_a Table
1206-
r3.at "Path" . to_vector . map .name . should_equal [tsv_file.name, empty_xls_file.name]
1206+
r3.at "Path" . to_vector . should_equal [tsv_file.path, empty_xls_file.path]
12071207
r3.at "Value" . at 0 . should_be_a Table
12081208
r3.at "Value" . at 1 . should_be_a Excel_Workbook
12091209

0 commit comments

Comments
 (0)