From 83e2f6277dbf5765d561eb0a8bc32829d63d98e2 Mon Sep 17 00:00:00 2001
From: loubnabnl
Date: Tue, 14 Nov 2023 15:00:23 +0000
Subject: [PATCH 1/3] add server generation

---
 examples/run_text_generation_starcoder.sh | 46 ++++++++++++
 megatron/text_generation_server.py        |  2 +-
 tools/run_requests_humaneval.py           | 85 +++++++++++++++++++++++
 3 files changed, 132 insertions(+), 1 deletion(-)
 create mode 100644 examples/run_text_generation_starcoder.sh
 create mode 100644 tools/run_requests_humaneval.py

diff --git a/examples/run_text_generation_starcoder.sh b/examples/run_text_generation_starcoder.sh
new file mode 100644
index 0000000000..0de1a44fea
--- /dev/null
+++ b/examples/run_text_generation_starcoder.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+# This example will start serving the 1B model.
+# You may need to change the Flask port in the MegatronServer class if it is already in use; we changed it from the default 5000 to 8080.
+DISTRIBUTED_ARGS="--nproc_per_node 1 \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr ip-26-0-156-56 \
+                  --master_port 6000"
+
+
+CHECKPOINT=/fsx/loubna/data/extra/generations_starcoder2_1b_200k/megatron
+
+#/mp_rank_00/model_optim_rng.pt
+VOCAB_FILE=/fsx/bigcode/experiments/pretraining/starcoder2-1B/checkpoints/conversions/vocab.json
+MERGE_FILE=/fsx/bigcode/experiments/pretraining/starcoder2-1B/checkpoints/conversions/merges.txt
+TOKENIZER_FILE=/fsx/loubna/data/tokenizer/starcoder2-smol-internal-1/tokenizer.json
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+#pip install flask-restful
+
+torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
+       --tensor-model-parallel-size 1 \
+       --pipeline-model-parallel-size 1 \
+       --num-layers 24 \
+       --hidden-size 2048 \
+       --num-attention-heads 16 \
+       --attention-head-type multiquery \
+       --init-method-std 0.02209 \
+       --seq-length 4096 \
+       --use-rotary-position-embeddings \
+       --max-position-embeddings 4096 \
+       --rotary-theta 100000 \
+       --attention-dropout 0.1 \
+       --hidden-dropout 0.1 \
+       --load ${CHECKPOINT} \
+       --tokenizer-type TokenizerFromFile \
+       --tokenizer-file $TOKENIZER_FILE \
+       --bf16 \
+       --micro-batch-size 1 \
+       --seq-length 1024 \
+       --out-seq-length 512 \
+       --temperature 0 \
+       --top_p 0.9 \
+       --seed 42
+       --output_file
diff --git a/megatron/text_generation_server.py b/megatron/text_generation_server.py
index 58550f2e63..80dd0b288a 100644
--- a/megatron/text_generation_server.py
+++ b/megatron/text_generation_server.py
@@ -238,4 +238,4 @@ def __init__(self, model):
         api.add_resource(MegatronGenerate, '/api', resource_class_args=[model])

     def run(self, url):
-        self.app.run(url, threaded=True, debug=False)
+        self.app.run(url, threaded=True, debug=False, port=8080)
diff --git a/tools/run_requests_humaneval.py b/tools/run_requests_humaneval.py
new file mode 100644
index 0000000000..115f4d4b1d
--- /dev/null
+++ b/tools/run_requests_humaneval.py
@@ -0,0 +1,85 @@
+import requests
+import json
+from human_eval.data import write_jsonl, read_problems
+
+
+NUM_SAMPLES_PER_TASK = 1
+stop_tokens = ["\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif", "\n```", "", "", "<|endoftext|>"]
+
+
+def query_server(prompt):
+    url = 'http://localhost:8080/api'
+    headers = {'Content-Type': 'application/json; charset=UTF-8'}
+    data = {"prompts": [prompt], "tokens_to_generate": 512}
+    response = requests.put(url, json=data, headers=headers)
+    result = json.loads(response.text)["text"]
+    return result[0]
+
+
+def stop_at_stop_token(decoded_string, stop_tokens):
+    """
+    Produces the prefix of decoded_string that ends at the first occurrence of
+    a stop_token.
+    WARNING: the decoded_string *must not* include the prompt, which may itself contain
+    stop tokens.
+    """
+    min_stop_index = len(decoded_string)
+    for stop_token in stop_tokens:
+        stop_index = decoded_string.find(stop_token)
+        if stop_index != -1 and stop_index < min_stop_index:
+            min_stop_index = stop_index
+    return decoded_string[:min_stop_index]
+
+
+def postprocess_generation(generation, prompt):
+    """Defines the postprocessing for an LM generation.
+    :param generation: str
+        code generation from the LM
+    :param prompt: str
+        prompt the completion was generated from
+    """
+    if not generation.startswith(prompt[:20]):
+        print(f"issue with generation: {generation}")
+        print(f"original prompt: {prompt}")
+    generation = generation[len(prompt) :]
+    return prompt + stop_at_stop_token(generation, stop_tokens)
+
+
+def main():
+    problems = read_problems()
+    prompts = [
+        problems[task_id]["prompt"]
+        for task_id in problems
+        for _ in range(NUM_SAMPLES_PER_TASK)
+    ]
+
+    errors = []
+    success = 0
+    generations = []
+    postprocessed_generations = []
+    for i, prompt in enumerate(prompts):
+        prompt = prompt.strip()
+        try:
+            result = query_server(prompt)
+            generations.append([result])
+            postprocessed_generations.append([postprocess_generation(result, prompt)])
+            success += 1
+        except Exception as e:
+            print(f"Error processing problem '{i}': {e}")
+            errors.append(i)
+        if i % 10 == 0:
+            print(f"Processed {i} problems")
+            print(f"Failed problem generations are: {errors}")
+            #print(f"Example:\n{result}END\n")
+
+    print(f"Done! {success} successful problems out of {len(prompts)}, failed are: {errors}")
+
+    with open('megatron_generations.json', 'w') as f:
+        json.dump(generations, f)
+
+    with open('megatron_postprocessed_generations.json', 'w') as f:
+        json.dump(postprocessed_generations, f)
+
+
+if __name__ == '__main__':
+    main()

From 2f6d17222146c516d9822e32589b644621c4564e Mon Sep 17 00:00:00 2001
From: Loubna Ben Allal <44069155+loubnabnl@users.noreply.github.com>
Date: Tue, 14 Nov 2023 15:33:16 +0000
Subject: [PATCH 2/3] update seq length

---
 examples/run_text_generation_starcoder.sh | 2 --
 1 file changed, 2 deletions(-)

diff --git a/examples/run_text_generation_starcoder.sh b/examples/run_text_generation_starcoder.sh
index 0de1a44fea..1f6ca3c53a 100644
--- a/examples/run_text_generation_starcoder.sh
+++ b/examples/run_text_generation_starcoder.sh
@@ -38,9 +38,7 @@ torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
        --tokenizer-file $TOKENIZER_FILE \
        --bf16 \
        --micro-batch-size 1 \
-       --seq-length 1024 \
        --out-seq-length 512 \
        --temperature 0 \
-       --top_p 0.9 \
        --seed 42
        --output_file

From 6de3bd1bc69b0dd999899c06448fdc604febf780 Mon Sep 17 00:00:00 2001
From: loubnabnl
Date: Tue, 14 Nov 2023 17:48:55 +0000
Subject: [PATCH 3/3] fix temperature

---
 examples/run_text_generation_starcoder.sh |  5 +----
 tools/run_requests_humaneval.py           | 10 ++++------
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/examples/run_text_generation_starcoder.sh b/examples/run_text_generation_starcoder.sh
index 0de1a44fea..b2feb7bf68 100644
--- a/examples/run_text_generation_starcoder.sh
+++ b/examples/run_text_generation_starcoder.sh
@@ -4,7 +4,7 @@
 DISTRIBUTED_ARGS="--nproc_per_node 1 \
                   --nnodes 1 \
                   --node_rank 0 \
-                  --master_addr ip-26-0-156-56 \
+                  --master_addr ip-26-0-156-228 \
                   --master_port 6000"
@@ -38,9 +38,6 @@ torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
        --tokenizer-file $TOKENIZER_FILE \
        --bf16 \
        --micro-batch-size 1 \
-       --seq-length 1024 \
        --out-seq-length 512 \
-       --temperature 0 \
-       --top_p 0.9 \
        --seed 42
        --output_file
diff --git a/tools/run_requests_humaneval.py b/tools/run_requests_humaneval.py
index 115f4d4b1d..ff52c54cd9 100644
--- a/tools/run_requests_humaneval.py
+++ b/tools/run_requests_humaneval.py
@@ -7,10 +7,10 @@
 stop_tokens = ["\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif", "\n```", "", "", "<|endoftext|>"]


-def query_server(prompt):
+def query_server(prompt, temperature=0.1):
     url = 'http://localhost:8080/api'
     headers = {'Content-Type': 'application/json; charset=UTF-8'}
-    data = {"prompts": [prompt], "tokens_to_generate": 512}
+    data = {"prompts": [prompt], "tokens_to_generate": 256, "temperature": 0.00001, "stop_token": 0, "random_seed": 1234}
     response = requests.put(url, json=data, headers=headers)
     result = json.loads(response.text)["text"]
     return result[0]
@@ -50,9 +50,7 @@ def main():
     prompts = [
         problems[task_id]["prompt"]
         for task_id in problems
-        for _ in range(NUM_SAMPLES_PER_TASK)
     ]
-
     errors = []
     success = 0
     generations = []
@@ -74,10 +72,10 @@ def postprocess_generation(generation, prompt):

     print(f"Done! {success} successful problems out of {len(prompts)}, failed are: {errors}")

-    with open('megatron_generations.json', 'w') as f:
+    with open('megatron_generations_fixtemp_50.json', 'w') as f:
         json.dump(generations, f)

-    with open('megatron_postprocessed_generations.json', 'w') as f:
+    with open('megatron_postprocessed_generations_fixtemp_50.json', 'w') as f:
         json.dump(postprocessed_generations, f)
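
A quick way to sanity-check the serving setup above before running the full sweep in tools/run_requests_humaneval.py is to send a single prompt by hand. The snippet below is a minimal sketch, not part of the patches: it assumes the server launched by examples/run_text_generation_starcoder.sh is reachable on localhost with the Flask port set to 8080 (as patched in megatron/text_generation_server.py), reuses the same request payload keys ("prompts", "tokens_to_generate") as the script, and uses an arbitrary example prompt and token budget.

import requests

# Smoke test: send one prompt to the running Megatron text generation server.
# Assumes the server is listening on localhost:8080 (see the port change above).
url = "http://localhost:8080/api"
headers = {"Content-Type": "application/json; charset=UTF-8"}
payload = {"prompts": ["def fibonacci(n):"], "tokens_to_generate": 64}

response = requests.put(url, json=payload, headers=headers)
response.raise_for_status()

# The server returns generations under the "text" key, one entry per prompt.
print(response.json()["text"][0])

If this prints a plausible completion, the checkpoint and tokenizer paths in the launch script and the port change in the server are wired up correctly, and the HumanEval script should run against the same endpoint.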