Skip to content

Commit

Permalink
first
Browse files Browse the repository at this point in the history
  • Loading branch information
Jianyu03 committed Jun 17, 2024
0 parents commit 9da56cb
Show file tree
Hide file tree
Showing 67 changed files with 17,748 additions and 0 deletions.
86 changes: 86 additions & 0 deletions conversation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from fastchat.conversation import Conversation, SeparatorStyle

# Look these values up in the model's tokenizer_config.json

def _qwen_conv():
    """Build a fresh ChatML-style ``qwen`` conversation template.

    A new ``Conversation`` (with its own ``stop_token_ids`` list) is created
    on every call, so registry entries never share mutable state.
    """
    return Conversation(
        name="qwen",
        system_template="<|im_start|>system\n{system_message}",
        system_message="You are a helpful assistant.",
        roles=("<|im_start|>user", "<|im_start|>assistant"),
        sep_style=SeparatorStyle.CHATML,
        sep="<|im_end|>",
        # The first three ids are "<|endoftext|>", "<|im_start|>", "<|im_end|>".
        # NOTE(review): 14582 is not explained by the original comment --
        # confirm which token it maps to in the tokenizer.
        stop_token_ids=[151643, 151644, 151645, 14582],
        stop_str="<|endoftext|>",
    )


# Prompt templates keyed by model family name.  All Qwen variants use the
# same template; each key still gets its own Conversation instance.
CONVS = {
    family: _qwen_conv()
    for family in ('Qwen1.5', 'Qwen1.5-Chat', 'Qwen')
}
CONVS['Llama-2'] = Conversation(
    name="llama-2",
    system_template="[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n",
    roles=("[INST]", "[/INST]"),
    sep_style=SeparatorStyle.LLAMA2,
    sep=" ",
    sep2=" </s><s>",
)


# System prompt installed on the conversation when the caller supplies none.
_DEFAULT_SYSTEM_MESSAGE = "You will write beautiful compliments according to needs"

# (user tag, assistant tag) appended to the conversation, keyed by the
# template's ``name`` attribute.  Names missing here are unsupported.
_ROLE_TAGS = {
    'llama-2': ("[INST]", "[/INST]"),
    'qwen': ("<|im_start|>user", "<|im_start|>assistant"),
}


def generate_inputs(standard_conv, prompt_q, tokenizer,
                    system_message=_DEFAULT_SYSTEM_MESSAGE):
    """Render a single-turn prompt from a template and tokenize it.

    The template is copied first, so ``standard_conv`` itself is never
    modified.  The copy receives ``system_message``, then one user turn
    holding ``prompt_q`` and an empty assistant turn, and the rendered
    prompt is passed to ``tokenizer``.

    Args:
        standard_conv: Conversation template exposing ``name``, ``copy()``,
            ``set_system_message()``, ``append_message()`` and
            ``get_prompt()``.  Only the names ``'llama-2'`` and ``'qwen'``
            are supported.
        prompt_q: Text of the user turn.
        tokenizer: Callable invoked as ``tokenizer(prompt,
            return_tensors='pt')``; its result is indexed with
            ``"input_ids"``.
        system_message: System prompt to install on the copied template.
            Defaults to the prompt this function has always used.

    Returns:
        The ``"input_ids"`` entry of the tokenizer output (presumably a
        PyTorch tensor given ``return_tensors='pt'`` -- confirm against the
        tokenizer in use), or ``None`` when ``standard_conv.name`` is not a
        supported template name.
    """
    role_tags = _ROLE_TAGS.get(standard_conv.name)
    if role_tags is None:
        # Unknown template: keep the historical "no result" signal.
        return None
    user_tag, assistant_tag = role_tags

    conv = standard_conv.copy()
    conv.set_system_message(system_message)
    conv.append_message(user_tag, prompt_q)
    # A ``None`` message leaves the assistant turn open for generation.
    conv.append_message(assistant_tag, None)

    encoded = tokenizer(conv.get_prompt(), return_tensors='pt')
    return encoded["input_ids"]
500 changes: 500 additions & 0 deletions data/MATH-500/test.jsonl

Large diffs are not rendered by default.

12,000 changes: 12,000 additions & 0 deletions data/MATH-500/train.jsonl

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions data/MMLU-STEM/dataset_dict.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"splits": ["train"]}
Binary file added data/MMLU-STEM/train/cache-371d744ad5ee6a7b.arrow
Binary file not shown.
Binary file added data/MMLU-STEM/train/data-00000-of-00001.arrow
Binary file not shown.
53 changes: 53 additions & 0 deletions data/MMLU-STEM/train/dataset_info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
{
"builder_name": "json",
"citation": "",
"config_name": "default",
"dataset_name": "mmlu-stem",
"dataset_size": 1000011,
"description": "",
"download_checksums": {
"hf://datasets/TIGER-Lab/MMLU-STEM@299b1a95bf25a953b7af2f71dacb39fa7a5299c5/stem.json": {
"num_bytes": 1291805,
"checksum": null
}
},
"download_size": 1291805,
"features": {
"question": {
"dtype": "string",
"_type": "Value"
},
"subject": {
"dtype": "string",
"_type": "Value"
},
"choices": {
"feature": {
"dtype": "string",
"_type": "Value"
},
"_type": "Sequence"
},
"answer": {
"dtype": "int64",
"_type": "Value"
}
},
"homepage": "",
"license": "",
"size_in_bytes": 2291816,
"splits": {
"train": {
"name": "train",
"num_bytes": 1000011,
"num_examples": 3134,
"dataset_name": "mmlu-stem"
}
},
"version": {
"version_str": "0.0.0",
"major": 0,
"minor": 0,
"patch": 0
}
}
13 changes: 13 additions & 0 deletions data/MMLU-STEM/train/state.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"_data_files": [
{
"filename": "data-00000-of-00001.arrow"
}
],
"_fingerprint": "d699cf3943feae0b",
"_format_columns": null,
"_format_kwargs": {},
"_format_type": null,
"_output_all_columns": false,
"_split": "train"
}
80 changes: 80 additions & 0 deletions data/MTbench_question.jsonl

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions data/gsm8k/dataset_dict.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"splits": ["train", "test"]}
Binary file added data/gsm8k/test/cache-82291f8805e0b982.arrow
Binary file not shown.
Binary file added data/gsm8k/test/cache-8f0f4271a35415fe.arrow
Binary file not shown.
Binary file added data/gsm8k/test/data-00000-of-00001.arrow
Binary file not shown.
52 changes: 52 additions & 0 deletions data/gsm8k/test/dataset_info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
{
"builder_name": "parquet",
"citation": "",
"config_name": "main",
"dataset_name": "gsm8k",
"dataset_size": 4676934,
"description": "",
"download_checksums": {
"hf://datasets/gsm8k@e53f048856ff4f594e959d75785d2c2d37b678ee/main/train-00000-of-00001.parquet": {
"num_bytes": 2306545,
"checksum": null
},
"hf://datasets/gsm8k@e53f048856ff4f594e959d75785d2c2d37b678ee/main/test-00000-of-00001.parquet": {
"num_bytes": 419088,
"checksum": null
}
},
"download_size": 2725633,
"features": {
"question": {
"dtype": "string",
"_type": "Value"
},
"answer": {
"dtype": "string",
"_type": "Value"
}
},
"homepage": "",
"license": "",
"size_in_bytes": 7402567,
"splits": {
"train": {
"name": "train",
"num_bytes": 3963202,
"num_examples": 7473,
"dataset_name": "gsm8k"
},
"test": {
"name": "test",
"num_bytes": 713732,
"num_examples": 1319,
"dataset_name": "gsm8k"
}
},
"version": {
"version_str": "0.0.0",
"major": 0,
"minor": 0,
"patch": 0
}
}
13 changes: 13 additions & 0 deletions data/gsm8k/test/state.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"_data_files": [
{
"filename": "data-00000-of-00001.arrow"
}
],
"_fingerprint": "77dafb2c804c8978",
"_format_columns": null,
"_format_kwargs": {},
"_format_type": null,
"_output_all_columns": false,
"_split": "test"
}
Binary file added data/gsm8k/train/data-00000-of-00001.arrow
Binary file not shown.
52 changes: 52 additions & 0 deletions data/gsm8k/train/dataset_info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
{
"builder_name": "parquet",
"citation": "",
"config_name": "main",
"dataset_name": "gsm8k",
"dataset_size": 4676934,
"description": "",
"download_checksums": {
"hf://datasets/gsm8k@e53f048856ff4f594e959d75785d2c2d37b678ee/main/train-00000-of-00001.parquet": {
"num_bytes": 2306545,
"checksum": null
},
"hf://datasets/gsm8k@e53f048856ff4f594e959d75785d2c2d37b678ee/main/test-00000-of-00001.parquet": {
"num_bytes": 419088,
"checksum": null
}
},
"download_size": 2725633,
"features": {
"question": {
"dtype": "string",
"_type": "Value"
},
"answer": {
"dtype": "string",
"_type": "Value"
}
},
"homepage": "",
"license": "",
"size_in_bytes": 7402567,
"splits": {
"train": {
"name": "train",
"num_bytes": 3963202,
"num_examples": 7473,
"dataset_name": "gsm8k"
},
"test": {
"name": "test",
"num_bytes": 713732,
"num_examples": 1319,
"dataset_name": "gsm8k"
}
},
"version": {
"version_str": "0.0.0",
"major": 0,
"minor": 0,
"patch": 0
}
}
13 changes: 13 additions & 0 deletions data/gsm8k/train/state.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"_data_files": [
{
"filename": "data-00000-of-00001.arrow"
}
],
"_fingerprint": "6c618aa7490feacd",
"_format_columns": null,
"_format_kwargs": {},
"_format_type": null,
"_output_all_columns": false,
"_split": "train"
}
Loading

0 comments on commit 9da56cb

Please sign in to comment.