Use the `dtype` parameter of `pd.read_csv`.

LAION-AI · Aug 17, 2024 · 01d86e4
1 parent f1e6ed9
commit 01d86e4
Show file tree

Hide file tree

Showing 6 changed files with 6 additions and 0 deletions.
diff --git a/data/datasets/bart_searchgpt_wiki_nlp_augment/3_10k_bart_trial.py b/data/datasets/bart_searchgpt_wiki_nlp_augment/3_10k_bart_trial.py
@@ -12,6 +12,7 @@ def num_tokens_from_string(string: str) -> int:
 
 
 if __name__ == "__main__":
+    # Use the `dtype` parameter of `pd.read_csv`.
     sampled_df = pd.read_csv("wiki_qa_bart_10000row_input.csv")
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print(device)

diff --git a/data/datasets/bart_searchgpt_wiki_nlp_augment/4_convert_to_oa_format.py b/data/datasets/bart_searchgpt_wiki_nlp_augment/4_convert_to_oa_format.py
@@ -3,6 +3,7 @@
 import pandas as pd
 
 if __name__ == "__main__":
+    # Use the `dtype` parameter of `pd.read_csv`.
     raw_df = pd.read_csv(r"...\wiki_qa_bart_10000row.csv")
     # print(raw_df.iloc[0])
     # print(raw_df.columns)

diff --git a/data/datasets/mt_note_generation/prepare.py b/data/datasets/mt_note_generation/prepare.py
@@ -63,6 +63,7 @@ def main(output_dir: str = "data"):
     """Download and prepare the dataset for use."""
     os.makedirs(output_dir, exist_ok=True)
     kaggle.api.dataset_download_files("tboyle10/medicaltranscriptions", "data", unzip=True)
+    # Use the `dtype` parameter of `pd.read_csv`.
     mt_samples = preprocess(pd.read_csv("data/mtsamples.csv"))
     conversations = get_conversations(mt_samples)
     random.shuffle(conversations)

diff --git a/data/datasets/poetry_instruction/prepare.py b/data/datasets/poetry_instruction/prepare.py
@@ -14,6 +14,7 @@
 
 # Read the CSV file into a pandas dataframe
 csv_file = os.path.join(download_path, "PoetryFoundationData.csv")
+# Use the `dtype` parameter of `pd.read_csv`.
 df = pd.read_csv(csv_file)
 
 # The data in the CSV file is not formatted correctly, so we need to clean it up.

diff --git a/data/datasets/zhihu-kol/convert_parquet.py b/data/datasets/zhihu-kol/convert_parquet.py
@@ -43,6 +43,7 @@ def reformat_csv_to_openassistant(df: pd.DataFrame) -> pd.DataFrame:
 if __name__ == "__main__":
     input_csv = "zhihu.csv"
     # Create a pandas dataframe from your dataset file(s)
+    # Use the `dtype` parameter of `pd.read_csv`.
     df = pd.read_csv(input_csv)  # or any other way
     df = reformat_csv_to_openassistant(df)
     # Save the file in the Parquet format

diff --git a/scripts/data_augment/data_augment.py b/scripts/data_augment/data_augment.py
@@ -458,6 +458,7 @@ def parse_arguments():
 
 
 def read_data(args):
+    # Use the `dtype` parameter of `pd.read_csv`.
     files = pd.read_csv(args.dataset, sep=",", header=None, names=["file"])
     files = files["file"].tolist()
     data = []