diff --git a/verticapy/tests/utilities/test_utilities.py b/verticapy/tests/utilities/test_utilities.py index 201572789..970c71dbe 100755 --- a/verticapy/tests/utilities/test_utilities.py +++ b/verticapy/tests/utilities/test_utilities.py @@ -476,7 +476,7 @@ def test_pjson(self): "recordid": "Varchar(80)", } - def test_read_json(self): + def test_read_json(self, laliga_vd): drop("public.titanic_verticapy_test_json", method="table") path = os.path.dirname(verticapy.__file__) + "/tests/utilities/" result = read_json( @@ -496,97 +496,117 @@ def test_read_json(self): ) assert result.shape() == (1782, 15) assert drop("v_temp_schema.titanic_verticapy_test_json", method="table") + + """ + # doing an ingest_local = False does not work yet - path = os.path.dirname(verticapy.__file__) + "/data/laliga/" - #TO DO - # testing for multiple files - # Not working: ERROR, Message: Failed to read json source []: Read error when expanding glob: ""it works locally"" - #drop("public.laliga_verticapy_test_json", method="table") - #vdf = read_json( - # path + "*.json", - # table_name="laliga_verticapy_test_json", - # schema="public", - # ingest_local=False, - # use_complex_dt=True, - #) - #assert vdf.shape()==(452,14) + # TODO test with ingest_local = False + + # use complex dt + laliga_vd.to_json("/home/dbadmin/laliga/", n_files=5, order_by="match_id") + path = "/home/dbadmin/laliga/*.json" + drop("public.laliga_verticapy_test_json", method="table") + vdf = read_json( + path, + table_name="laliga_verticapy_test_json", + schema="public", + ingest_local=False, + use_complex_dt=True, + ) + assert vdf.shape() == (452, 14) + """ # Trying SQL - + path = os.path.dirname(verticapy.__file__) + "/data/laliga/*.json" drop("public.laliga_verticapy_test_json", method="table") queries = read_json( - path + "2005.json", + path, table_name="laliga_verticapy_test_json", schema="public", genSQL=True, - ingest_local=False, - use_complex_dt=True, + ingest_local=True, + use_complex_dt=False, ) for query in queries: - current_cursor().execute(query) + current_cursor().execute( + query.replace("tmp_flex_dbadmin", "tmp_flex_test_read_json") + ) vdf = vDataFrame("public.laliga_verticapy_test_json") - assert vdf.shape()==(17,14) + assert vdf.shape() == (452, 40) assert vdf["away_score"].ctype().lower()[0:3] == "int" - assert vdf["away_team"]["away_team_id"].ctype().lower()[0:3] == "int" - assert vdf["match_status"].ctype().lower() == "varchar(80)" - assert vdf["away_team"]["away_team_gender"] == "varchar(80)" - assert not(isflextable(table_name="laliga_verticapy_test_json", schema="public")) + assert vdf["away_team.away_team_id"].ctype().lower()[0:3] == "int" + assert vdf["match_status"].ctype().lower() == "varchar(20)" + assert vdf["away_team.away_team_gender"].ctype().lower() == "varchar(20)" + assert not ( + isflextable(table_name="laliga_verticapy_test_json", schema="public") + ) + """ + -- TO DO, tests on insert! - it seems to not work well # testing insert - vdf=read_json( - path + "2005.json", + vdf = read_json( + path, table_name="laliga_verticapy_test_json", schema="public", insert=True, ingest_local=False, - use_complex_dt=True, + use_complex_dt=False, ) - assert vdf.shape()==(34,14) - + assert vdf.shape() == (904, 40) + """ # testing temporary table drop("public.laliga_verticapy_test_json", method="table") vdf = read_json( - path + "2005.json", + path, + table_name="laliga_verticapy_test_json", + schema="public", temporary_table=True, - ingest_local=False, - use_complex_dt=True, + ingest_local=True, + use_complex_dt=False, ) - assert vdf._VERTICAPY_VARIABLES_["schema"]=='v_temp_schema' - assert drop(vdf._VERTICAPY_VARIABLES_["schema"]+"."+vdf._VERTICAPY_VARIABLES_["input_relation"],method="table") + assert vdf._VERTICAPY_VARIABLES_["schema"] == "public" + assert drop("public.laliga_verticapy_test_json", method="table",) # testing local temporary table vdf = read_json( - path + "2005.json", + path, + table_name="laliga_verticapy_test_json2", temporary_local_table=True, - ingest_local=False, - use_complex_dt=True, + ingest_local=True, + use_complex_dt=False, + ) + assert vdf._VERTICAPY_VARIABLES_["schema"] == "v_temp_schema" + assert drop("v_temp_schema.laliga_verticapy_test_json2", method="table",) + + # Checking flextables and materialize option + path = os.path.dirname(verticapy.__file__) + "/tests/utilities/" + drop("public.titanic_verticapy_test_json") + result = read_json( + path + "titanic-passengers.json", + table_name="titanic_verticapy_test_json", + schema="public", + ingest_local=True, + materialize=False, ) - assert vdf._VERTICAPY_VARIABLES_["schema"]=='v_temp_schema' - assert drop(vdf._VERTICAPY_VARIABLES_["schema"]+"."+vdf._VERTICAPY_VARIABLES_["input_relation"],method="table") - - # Checking flextables and materialize option - # Not working right now - #path = os.path.dirname(verticapy.__file__) + "/tests/utilities/" - #drop("public.titanic_verticapy_test_json") - #result = read_json( - # path+"titanic-passengers.json", - # table_name="titanic_verticapy_test_json", - # schema="public", - # ingest_local=True, - # materialize=False, - # ) - #assert isflextable(table_name="titanic_verticapy_test_json",schema="public")==True - # + assert isflextable(table_name="titanic_verticapy_test_json", schema="public") + # Checking materialize, storing to database, and re-conversion to a vdataframe - #drop("public.titanic_verticapy_test_json_2") - #result.to_db('"public"."titanic_verticapy_test_json_2"') - #result2=vDataFrame("public.titanic_verticapy_test_json_2") - #assert result2["fields.cabin"].dtype()==result["fields.cabin"].dtype() - #assert result2["fields.age"].dtype()==result["fields.age"].dtype() - #assert result2["datasetid"].dtype()==result["datasetid"].dtype() - #assert result2["fields.fare"].dtype()==result["fields.fare"].dtype() - #assert result2["fields.parch"].dtype()[0:3]==result["fields.fare"].dtype()[0:3] - #assert result2["fields.pclass"].dtype()[0:3]==result["fields.pclass"].dtype()[0:3] + drop("public.titanic_verticapy_test_json_2") + result.to_db("public.titanic_verticapy_test_json_2") + result2 = vDataFrame("public.titanic_verticapy_test_json_2") + assert result2["fields.cabin"].dtype() == result["fields.cabin"].dtype() + assert result2["fields.age"].dtype() == result["fields.age"].dtype() + assert result2["datasetid"].dtype() == result["datasetid"].dtype() + assert result2["fields.fare"].dtype() == result["fields.fare"].dtype() + assert ( + result2["fields.parch"].dtype()[0:3] == result["fields.parch"].dtype()[0:3] + ) + assert ( + result2["fields.pclass"].dtype()[0:3] + == result["fields.pclass"].dtype()[0:3] + ) + assert drop("public.titanic_verticapy_test_json") + drop("public.titanic_verticapy_test_json_2") def test_read_csv(self): path = os.path.dirname(verticapy.__file__) + "/data/titanic.csv" @@ -669,35 +689,44 @@ def test_read_csv(self): drop("v_temp_schema.titanic_verticapy_test_csv", method="table") # Checking Flextable + path = os.path.dirname(verticapy.__file__) + "/data/" drop("public.titanic_verticapy_test_csv") result = read_csv( - path="titanic.csv", table_name="titanic_verticapy_test_csv",materialize=False,ingest_local=True, schema="public" - ) - assert isflextable(table_name="titanic_verticapy_test_csv",schema="public")==True + path=path + "titanic.csv", + table_name="titanic_verticapy_test_csv", + materialize=False, + ingest_local=True, + schema="public", + ) + assert isflextable(table_name="titanic_verticapy_test_csv", schema="public") # Checking materialize, storing to database, and re-conversion to a vdataframe drop("public.titanic_verticapy_test_csv_2") result.to_db('"public"."titanic_verticapy_test_csv_2"') - result2=vDataFrame("public.titanic_verticapy_test_csv_2") - assert result2["ticket"].dtype()==result["ticket"].dtype() - assert result2["survived"].dtype()[0:3]==result["survived"].dtype()[0:3] - assert result2["sibsp"].dtype()[0:3]==result["sibsp"].dtype()[0:3] - assert result2["pclass"].dtype()[0:3]==result["pclass"].dtype()[0:3] - assert result2["home.dest"].dtype()==result["home.dest"].dtype() - - # TODO - #drop("public.titanic_verticapy_test_csv_gz") - #result = read_csv( - # path+"titanic.csv.gz", table_name="titanic_verticapy_test_csv_gz",ingest_local=False, schema="public" - # ) - #assert result.shape() == (1234, 14) - - @pytest.mark.skip( - reason="for some reason, it can not read the file. It works when we do it locally." - ) + result2 = vDataFrame("public.titanic_verticapy_test_csv_2") + assert result2["ticket"].dtype() == result["ticket"].dtype() + assert result2["survived"].dtype()[0:3] == result["survived"].dtype()[0:3] + assert result2["sibsp"].dtype()[0:3] == result["sibsp"].dtype()[0:3] + assert result2["pclass"].dtype()[0:3] == result["pclass"].dtype()[0:3] + assert result2["home.dest"].dtype() == result["home.dest"].dtype() + + # with compression + path = os.path.dirname(verticapy.__file__) + "/tests/utilities/titanic.csv.gz" + drop("public.titanic_verticapy_test_csv_gz") + result3 = read_csv( + path, + table_name="titanic_verticapy_test_csv_gz", + ingest_local=True, + schema="public", + header_names=[col[1:-1] for col in result2.get_columns()], + ) + assert result3.shape() == (1234, 14) + + @pytest.mark.skip(reason="can not read files locally.") def test_read_file(self, laliga_vd): + laliga_vd.to_json("/home/dbadmin/laliga/", n_files=5, order_by="match_id") + path = "/home/dbadmin/laliga/*.json" drop(name="v_temp_schema.laliga_test") - path = os.path.dirname(verticapy.__file__) + "/data/laliga/*.json" vdf = read_file( path=path, schema="", @@ -742,17 +771,12 @@ def test_read_file(self, laliga_vd): assert laliga_vd.shape() == vdf.shape() # testing insert - path = os.path.dirname(verticapy.__file__) + "/data/laliga/" - vdf=read_file( - path + "2005.json", - ) - vdf=read_file( - path + "2005.json", - table_name=vdf._VERTICAPY_VARIABLES_["input_relation"], - insert=True, + vdf = read_file(path) + vdf = read_file( + path, table_name=vdf._VERTICAPY_VARIABLES_["input_relation"], insert=True, ) - assert vdf.shape()==(34,14) - + assert vdf.shape() == (904, 14) + def test_read_shp(self, cities_vd): drop(name="public.cities_test") cities_vd.to_shp("cities_test", "/home/dbadmin/", shape="Point") diff --git a/verticapy/tests/vDataFrame/test_vDF_utilities.py b/verticapy/tests/vDataFrame/test_vDF_utilities.py index 6462766fd..3abb4f526 100755 --- a/verticapy/tests/vDataFrame/test_vDF_utilities.py +++ b/verticapy/tests/vDataFrame/test_vDF_utilities.py @@ -28,6 +28,8 @@ drop, set_option, read_shp, + read_csv, + read_json, ) from verticapy.connect import current_cursor import verticapy.stats as st @@ -165,7 +167,25 @@ def test_vDF_to_csv(self, titanic_vd): raise os.remove("verticapy_test_to_csv.csv") file.close() - # TODO - test with multiple CSV files. + # multiple files + try: + titanic_vd.to_csv( + "titanic_verticapy_test_to_csv", + n_files=3, + order_by=["name", "age", "fare"], + ) + titanic_test = read_csv("titanic_verticapy_test_to_csv/*.csv") + assert titanic_test.shape() == (1234, 14) + except: + os.remove("titanic_verticapy_test_to_csv/1.csv") + os.remove("titanic_verticapy_test_to_csv/2.csv") + os.remove("titanic_verticapy_test_to_csv/3.csv") + os.rmdir("titanic_verticapy_test_to_csv") + raise + os.remove("titanic_verticapy_test_to_csv/1.csv") + os.remove("titanic_verticapy_test_to_csv/2.csv") + os.remove("titanic_verticapy_test_to_csv/3.csv") + os.rmdir("titanic_verticapy_test_to_csv") def test_vDF_to_parquet(self, titanic_vd): session_id = get_session() @@ -283,7 +303,25 @@ def test_vDF_to_json(self, titanic_vd): raise os.remove("verticapy_test_to_json.json") file.close() - # TODO - test with multiple JSON files. + # multiple files + try: + titanic_vd.to_json( + "titanic_verticapy_test_to_json", + n_files=3, + order_by=["name", "age", "fare"], + ) + titanic_test = read_json("titanic_verticapy_test_to_json/*.json") + assert titanic_test.shape() == (1234, 14) + except: + os.remove("titanic_verticapy_test_to_json/1.json") + os.remove("titanic_verticapy_test_to_json/2.json") + os.remove("titanic_verticapy_test_to_json/3.json") + os.rmdir("titanic_verticapy_test_to_json") + raise + os.remove("titanic_verticapy_test_to_json/1.json") + os.remove("titanic_verticapy_test_to_json/2.json") + os.remove("titanic_verticapy_test_to_json/3.json") + os.rmdir("titanic_verticapy_test_to_json") def test_vDF_to_list(self, titanic_vd): result = titanic_vd.select(["age", "survived"])[:20].to_list() diff --git a/verticapy/toolbox.py b/verticapy/toolbox.py index d3b5748a4..eb5bb06c0 100755 --- a/verticapy/toolbox.py +++ b/verticapy/toolbox.py @@ -180,6 +180,16 @@ def color_dict(d: dict, idx: int = 0): return gen_colors()[idx % len(gen_colors())] +# ---# +def find_val_in_dict(x: str, d: dict, return_key: bool = False): + for elem in d: + if quote_ident(x).lower() == quote_ident(elem).lower(): + if return_key: + return elem + return d[elem] + raise NameError(f'Key "{x}" was not found in {d}.') + + # ---# def flat_dict(d: dict) -> str: # converts dictionary to string with a specific format diff --git a/verticapy/utilities.py b/verticapy/utilities.py index 1fc06c840..5a2cea401 100644 --- a/verticapy/utilities.py +++ b/verticapy/utilities.py @@ -1931,9 +1931,11 @@ def read_csv( for i in range(len(file_header) - len(header_names)) ] if not (materialize): - suffix = "" - final_relation = input_relation - prefix = " ON COMMIT PRESERVE ROWS;" + suffix, prefix, final_relation = ( + "", + " ON COMMIT PRESERVE ROWS;", + input_relation, + ) if temporary_local_table: suffix = "LOCAL TEMP " final_relation = table_name @@ -2005,7 +2007,8 @@ def read_csv( os.remove(path_test) dtype_sorted = {} for elem in header_names: - dtype_sorted[elem] = dtype[elem] + key = find_val_in_dict(elem, dtype, return_key=True) + dtype_sorted[key] = dtype[key] query1 = create_table( table_name, dtype_sorted, @@ -2314,12 +2317,14 @@ def read_json( input_relation = quote_ident(table_name) all_queries = [] if not (materialize): - suffix = "" + suffix, prefix = "", "ON COMMIT PRESERVE ROWS;" if temporary_local_table: suffix = "LOCAL TEMP " elif temporary_table: suffix = "TEMP " - query = f"CREATE FLEX {suffix}TABLE {input_relation}(x int) ON COMMIT PRESERVE ROWS;" + else: + prefix = ";" + query = f"CREATE FLEX {suffix}TABLE {input_relation}(x int){prefix}" else: flex_name = gen_tmp_name(name="flex")[1:-1] query = f"CREATE FLEX LOCAL TEMP TABLE {flex_name}(x int) ON COMMIT PRESERVE ROWS;" @@ -2338,7 +2343,9 @@ def read_json( else: options += ["suppress_nonalphanumeric_key_chars=false"] if reject_on_materialized_type_error: - assert materialize, ParameterError('When using complex data types the table has to be materialized. Set materialize to True') + assert materialize, ParameterError( + "When using complex data types the table has to be materialized. Set materialize to True" + ) options += ["reject_on_materialized_type_error=true"] else: options += ["reject_on_materialized_type_error=false"] @@ -2956,17 +2963,11 @@ def __init__( # ---# def __iter__(self): - columns = self.values - return (elem for elem in columns) + return (elem for elem in self.values) # ---# def __getitem__(self, key): - all_cols = [elem for elem in self.values] - for elem in all_cols: - if quote_ident(str(elem).lower()) == quote_ident(str(key).lower()): - key = elem - break - return self.values[key] + return find_val_in_dict(key, self.values) # ---# def _repr_html_(self, interactive=False): diff --git a/verticapy/vdataframe.py b/verticapy/vdataframe.py index e6508d568..d4a793476 100755 --- a/verticapy/vdataframe.py +++ b/verticapy/vdataframe.py @@ -11590,7 +11590,7 @@ def to_csv( if not (order_by): order_by = self.__get_last_order_by__() if n_files > 1 and path: - os.makedirs(file_name) + os.makedirs(path) csv_files = [] while current_nb_rows_written < total: if new_header: @@ -11962,7 +11962,7 @@ def to_json( if not (order_by): order_by = self.__get_last_order_by__() if n_files > 1 and path: - os.makedirs(file_name) + os.makedirs(path) if not (path): json_files = [] while current_nb_rows_written < total: