From 8e91b1bfa8e230f1729c7eb8e4ef4728ef244588 Mon Sep 17 00:00:00 2001
From: Mahesh Sathiamoorthy
Date: Sat, 16 Nov 2024 07:18:17 +0000
Subject: [PATCH 1/8] Update readme with an overview and key features, which
 should explain why the user should care about Curator.

---
 README.md                                | 23 +++++++++++++++++++++--
 docs/Bespoke-Labs-Logomark-Red-Small.png | Bin 0 -> 4523 bytes
 2 files changed, 21 insertions(+), 2 deletions(-)
 create mode 100644 docs/Bespoke-Labs-Logomark-Red-Small.png

diff --git a/README.md b/README.md
index 92c080df..a9b6852f 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@

-
-    Bespoke Labs Logo
+
+    Bespoke Labs Logo

@@ -26,6 +26,22 @@

+### Overview
+
+Bespoke Curator makes it easy to create high-quality synthetic data at scale, which you can use to fine-tune models or for structured data extraction.
+
+Bespoke Curator is an open-source project that includes:
+* A rich Python-based library for generating and curating synthetic data.
+* A Curator Viewer that makes it easy to inspect datasets, aiding in dataset creation.
+* Forthcoming high-quality datasets that should move the needle on post-training.
+
+### Key Features
+
+1. **Programmability and Structured Outputs**: Synthetic data generation is a lot more than calling a single prompt; it is a sequence of LLM calls. You can orchestrate complex pipelines of LLM calls and use structured outputs to drive control flow. Curator treats structured outputs as first-class citizens.
+2. **Built-in Performance Optimization**: We often see LLMs called in loops, or multi-threading implemented inefficiently. Performance optimizations are baked in so that you don't need to worry about them.
+3. **Intelligent Caching and Fault Recovery**: LLM calls add up in cost and time, so failures are undesirable but sometimes unavoidable. We cache LLM requests and responses so that recovering from a failure is easy. Moreover, when working on a multi-stage pipeline, caching of stages makes it easy to iterate.
+4. **Native HuggingFace Dataset Integration**: Work directly on HuggingFace Dataset objects throughout your pipeline. Your synthetic data is immediately ready for fine-tuning!
+5. **Interactive Curator Viewer**: Improve and iterate on your prompts using our built-in viewer. Inspect LLM requests and responses in real time and refine your data generation strategy with immediate feedback.
 
 ### Installation
@@ -134,3 +150,6 @@ node -v # should print `v22.11.0`
 # verifies the right npm version is in the environment
 npm -v # should print `10.9.0`
 ```
+
+## Contributing
+Contributions are welcome!
\ No newline at end of file
diff --git a/docs/Bespoke-Labs-Logomark-Red-Small.png b/docs/Bespoke-Labs-Logomark-Red-Small.png
new file mode 100644
index 0000000000000000000000000000000000000000..633b7715541227ba299d6bb383805224a1e9ab92
GIT binary patch
literal 4523
[base85-encoded PNG data omitted]

literal 0
HcmV?d00001

From bdf38f1144b5dd2c972f4fdb3815cc3ec1f52f02 Mon Sep 17 00:00:00 2001
From: Mahesh Sathiamoorthy
Date: Sat, 16 Nov 2024 07:20:45 +0000
Subject: [PATCH 2/8] minor change

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a9b6852f..c4174615 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@

-    Bespoke Labs Curator
+    Bespoke Curator
 
 Data Curation for Post-Training & Structured Data Extraction


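As context for PATCH 3/8 below, which threads `presence_penalty` and `frequency_penalty` from `Prompter` down to the request processors, here is a minimal usage sketch. It is not part of the patch; the import path, model name, and prompt are assumptions for illustration, and only the penalty arguments reflect the change itself.

```python
# Minimal usage sketch for the parameters added in PATCH 3/8.
# Assumptions: the `bespokelabs.curator` import path, the model name, and the
# prompt are illustrative, not taken from the patch.
from pydantic import BaseModel, Field

from bespokelabs import curator


class Poem(BaseModel):
    poem: str = Field(description="A short poem.")


poet = curator.Prompter(
    model_name="gpt-4o-mini",
    prompt_func=lambda: "Write a short poem about the sea.",
    response_format=Poem,
    # New arguments: forwarded to the request processor and included in the
    # request body only when they are not None.
    presence_penalty=0.5,
    frequency_penalty=0.5,
)

dataset = poet()  # HuggingFace Dataset containing the structured responses
```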
From bb8ac4e4ece3fb83da81c7599afad12e13430072 Mon Sep 17 00:00:00 2001
From: Ryan Marten
Date: Sun, 17 Nov 2024 10:48:00 -0800
Subject: [PATCH 3/8] add frequency and presence penalty

---
 examples/poem.py                             |  2 +-
 src/bespokelabs/curator/prompter/prompter.py | 14 +++++++++++++-
 .../openai_batch_request_processor.py        | 10 ++++++++++
 .../openai_online_request_processor.py       | 10 ++++++++++
 4 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/examples/poem.py b/examples/poem.py
index f7cfe704..5697e5e2 100644
--- a/examples/poem.py
+++ b/examples/poem.py
@@ -55,4 +55,4 @@ class Poems(BaseModel):
 # 0 Dreams vs. reality In the realm where dreams take flight,\nWhere ...
 # 1 Dreams vs. reality Reality stands with open eyes,\nA weighty thro...
 # 2 Urban loneliness in a bustling city In the city's heart where shadows blend,\nAmon...
-# 3 Urban loneliness in a bustling city Among the crowds, I walk alone,\nA sea of face...
\ No newline at end of file
+# 3 Urban loneliness in a bustling city Among the crowds, I walk alone,\nA sea of face...
diff --git a/src/bespokelabs/curator/prompter/prompter.py b/src/bespokelabs/curator/prompter/prompter.py
index 63710b28..a8afa2da 100644
--- a/src/bespokelabs/curator/prompter/prompter.py
+++ b/src/bespokelabs/curator/prompter/prompter.py
@@ -51,6 +51,8 @@ def __init__(
         batch_size: Optional[int] = None,
         temperature: Optional[float] = None,
         top_p: Optional[float] = None,
+        presence_penalty: Optional[float] = None,
+        frequency_penalty: Optional[float] = None,
     ):
         """Initialize a Prompter.
 
@@ -64,6 +66,10 @@ def __init__(
                 response format from the LLM.
             batch (bool): Whether to use batch processing
             batch_size (Optional[int]): The size of the batch to use, only used if batch is True
+            temperature (Optional[float]): The temperature to use for the LLM, only used if batch is False
+            top_p (Optional[float]): The top_p to use for the LLM, only used if batch is False
+            presence_penalty (Optional[float]): The presence_penalty to use for the LLM, only used if batch is False
+            frequency_penalty (Optional[float]): The frequency_penalty to use for the LLM, only used if batch is False
         """
         prompt_sig = inspect.signature(prompt_func)
         if len(prompt_sig.parameters) > 1:
@@ -93,6 +99,8 @@ def __init__(
                 batch_size=batch_size,
                 temperature=temperature,
                 top_p=top_p,
+                presence_penalty=presence_penalty,
+                frequency_penalty=frequency_penalty,
             )
         else:
             if batch_size is not None:
@@ -100,7 +108,11 @@ def __init__(
                     f"Prompter argument `batch_size` {batch_size} is ignored because `batch` is False"
                 )
             self._request_processor = OpenAIOnlineRequestProcessor(
-                model=model_name, temperature=temperature, top_p=top_p
+                model=model_name,
+                temperature=temperature,
+                top_p=top_p,
+                presence_penalty=presence_penalty,
+                frequency_penalty=frequency_penalty,
             )
 
     def __call__(
diff --git a/src/bespokelabs/curator/request_processor/openai_batch_request_processor.py b/src/bespokelabs/curator/request_processor/openai_batch_request_processor.py
index 64027b72..ca1520ed 100644
--- a/src/bespokelabs/curator/request_processor/openai_batch_request_processor.py
+++ b/src/bespokelabs/curator/request_processor/openai_batch_request_processor.py
@@ -35,6 +35,8 @@ def __init__(
         check_interval: int = 10,
         api_key: str = os.getenv("OPENAI_API_KEY"),
         url: str = "https://api.openai.com/v1/chat/completions",
+        presence_penalty: float | None = None,
+        frequency_penalty: float | None = None,
     ):
         if batch_size > MAX_REQUESTS_PER_BATCH:
             raise ValueError(
@@ -48,6 +50,8 @@ def __init__(
         self.check_interval: int = check_interval
         self.temperature: float | None = temperature
         self.top_p: float | None = top_p
+        self.presence_penalty: float | None = presence_penalty
+        self.frequency_penalty: float | None = frequency_penalty
 
     def get_rate_limits(self) -> dict:
         """
@@ -132,6 +136,12 @@ def create_api_specific_request(
         if self.top_p is not None:
             body["top_p"] = self.top_p
 
+        if self.presence_penalty is not None:
+            body["presence_penalty"] = self.presence_penalty
+
+        if self.frequency_penalty is not None:
+            body["frequency_penalty"] = self.frequency_penalty
+
         request = {
             "custom_id": str(generic_request.original_row_idx),
             "method": "POST",
diff --git a/src/bespokelabs/curator/request_processor/openai_online_request_processor.py b/src/bespokelabs/curator/request_processor/openai_online_request_processor.py
index 379d3fe3..1a199f1c 100644
--- a/src/bespokelabs/curator/request_processor/openai_online_request_processor.py
+++ b/src/bespokelabs/curator/request_processor/openai_online_request_processor.py
@@ -36,6 +36,8 @@ def __init__(
         url: str = "https://api.openai.com/v1/chat/completions",
         temperature: Optional[float] = None,
         top_p: Optional[float] = None,
+        presence_penalty: Optional[float] = None,
+        frequency_penalty: Optional[float] = None,
     ):
         super().__init__(batch_size=None)
         self.model: str = model
@@ -43,6 +45,8 @@ def __init__(
         self.api_key: str = api_key
         self.temperature: float = temperature
         self.top_p: float = top_p
+        self.presence_penalty: float = presence_penalty
+        self.frequency_penalty: float = frequency_penalty
 
     def get_rate_limits(self) -> dict:
         """
@@ -117,6 +121,12 @@ def create_api_specific_request(
         if self.top_p is not None:
             request["top_p"] = self.top_p
 
+        if self.presence_penalty is not None:
+            request["presence_penalty"] = self.presence_penalty
+
+        if self.frequency_penalty is not None:
+            request["frequency_penalty"] = self.frequency_penalty
+
         return request
 
     def run(

From 386fcdf5f785ba363ce734b1c8f7e9cc4a0e6c78 Mon Sep 17 00:00:00 2001
From: Charlie Cheng-Jie Ji
Date: Sun, 17 Nov 2024 23:55:25 +0000
Subject: [PATCH 4/8] add system prompt in detailed view

---
 .../dataset-viewer/DetailsSidebar.tsx | 22 ++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/bespoke-dataset-viewer/components/dataset-viewer/DetailsSidebar.tsx b/bespoke-dataset-viewer/components/dataset-viewer/DetailsSidebar.tsx
index 8a3865f7..02d5fcd5 100644
--- a/bespoke-dataset-viewer/components/dataset-viewer/DetailsSidebar.tsx
+++ b/bespoke-dataset-viewer/components/dataset-viewer/DetailsSidebar.tsx
@@ -45,13 +45,33 @@ export function DetailsSidebar({ item, onClose }: DetailsSidebarProps) {

{item.generic_request.model}

+          {item.generic_request.messages.some(m => m.role === "system") && (
+            <>
+              System Prompt
+              {item.generic_request.messages.find(m => m.role === "system")?.content}
+            </>
+          )}

User Message

{item.generic_request.messages.find(m => m.role === "user")?.content}