Skip to content

Commit

Permalink
Use the dtype parameter of pd.read_csv.
Browse files: browse the repository at this point in the history
  • Loading branch information
monster29000 committed Aug 17, 2024
1 parent f1e6ed9 commit 01d86e4
Show file tree
Hide file tree
Showing 6 changed files with 6 additions and 0 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -12,6 +12,7 @@ def num_tokens_from_string(string: str) -> int:


if __name__ == "__main__":
# Use the `dtype` parameter of `pd.read_csv`.
sampled_df = pd.read_csv("wiki_qa_bart_10000row_input.csv")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -3,6 +3,7 @@
import pandas as pd

if __name__ == "__main__":
# Use the `dtype` parameter of `pd.read_csv`.
raw_df = pd.read_csv(r"...\wiki_qa_bart_10000row.csv")
# print(raw_df.iloc[0])
# print(raw_df.columns)
Expand Down
1 change: 1 addition & 0 deletions data/datasets/mt_note_generation/prepare.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -63,6 +63,7 @@ def main(output_dir: str = "data"):
"""Download and prepare the dataset for use."""
os.makedirs(output_dir, exist_ok=True)
kaggle.api.dataset_download_files("tboyle10/medicaltranscriptions", "data", unzip=True)
# Use the `dtype` parameter of `pd.read_csv`.
mt_samples = preprocess(pd.read_csv("data/mtsamples.csv"))
conversations = get_conversations(mt_samples)
random.shuffle(conversations)
Expand Down
1 change: 1 addition & 0 deletions data/datasets/poetry_instruction/prepare.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -14,6 +14,7 @@

# Read the CSV file into a pandas dataframe
csv_file = os.path.join(download_path, "PoetryFoundationData.csv")
# Use the `dtype` parameter of `pd.read_csv`.
df = pd.read_csv(csv_file)

# The data in the CSV file is not formatted correctly, so we need to clean it up.
Expand Down
1 change: 1 addition & 0 deletions data/datasets/zhihu-kol/convert_parquet.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -43,6 +43,7 @@ def reformat_csv_to_openassistant(df: pd.DataFrame) -> pd.DataFrame:
if __name__ == "__main__":
input_csv = "zhihu.csv"
# Create a pandas dataframe from your dataset file(s)
# Use the `dtype` parameter of `pd.read_csv`.
df = pd.read_csv(input_csv) # or any other way
df = reformat_csv_to_openassistant(df)
# Save the file in the Parquet format
Expand Down
1 change: 1 addition & 0 deletions scripts/data_augment/data_augment.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -458,6 +458,7 @@ def parse_arguments():


def read_data(args):
# Use the `dtype` parameter of `pd.read_csv`.
files = pd.read_csv(args.dataset, sep=",", header=None, names=["file"])
files = files["file"].tolist()
data = []
Expand Down

0 comments on commit 01d86e4

Please sign in to comment.