Commit
Previously, much of the time was spent writing each token to the screen, which is relatively slow. With output buffering enabled, groups of computed tokens are written to an in-memory buffer, which is relatively fast, and the buffer is flushed to the screen/console periodically. Testing with the smallest model, an interactive tokens/s speed-up of ~14% on standard builds and up to ~84% on OpenMP builds has been achieved.

Usage: run <checkpoint_file> [temperature] [steps] [prompt] [buffer_tokens]

where buffer_tokens is the number of tokens to buffer before flushing. Multiples of 2 seem to be ideal; 64 worked well for my use case on a low-end machine. The speed-up may depend on model size and OS.

Example: ./run model.bin 0 0 "A car" 64