From 627a09cf0c9bcc2f824dd941310d03fc25bab46c Mon Sep 17 00:00:00 2001 From: Neeraj Pradhan Date: Fri, 17 Jan 2025 14:59:52 -0800 Subject: [PATCH] Add schema restrictions to the README (#44) --- README.md | 51 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 9775a4d..874123b 100644 --- a/README.md +++ b/README.md @@ -68,13 +68,23 @@ schema = { "name": {"type": "string", "description": "Candidate name"}, "experience": { "type": "array", + "description": "Work history", "items": { "type": "object", "properties": { - "company": {"type": "string"}, - "title": {"type": "string"}, - "start_date": {"type": "string"}, - "end_date": {"type": "string"}, + "company": { + "type": "string", + "description": "Company name", + }, + "title": {"type": "string", "description": "Job title"}, + "start_date": { + "anyOf": [{"type": "string"}, {"type": "null"}], + "description": "Start date of employment", + }, + "end_date": { + "anyOf": [{"type": "string"}, {"type": "null"}], + "description": "End date of employment", + }, }, }, }, @@ -84,6 +94,22 @@ schema = { agent = extractor.create_agent(name="resume-parser", data_schema=schema) ``` +### Important restrictions on JSON/Pydantic Schema + +*LlamaExtract only supports a subset of the JSON Schema specification.* While limited, it should +be sufficient for a wide variety of use-cases. + + - All fields are required by default. Nullable fields must be explicitly marked as such, + using `"anyOf"` with a `"null"` type. See `"start_date"` field above. + - Root node must be of type `"object"`. + - Schema nesting must be limited to 5 levels. + - The important fields are key names/titles, type and description. Fields for + formatting, default values, etc. are not supported. + - There are other restrictions on number of keys, size of the schema, etc. that you may + hit for complex extraction use cases. In such cases, it is worth considering how to restructure + your extraction workflow to fit within these constraints, e.g. by extracting a subset of fields + and later merging them together. + ## Other Extraction APIs ### Batch Processing @@ -137,13 +163,20 @@ pip install llama-extract==0.1.0 ## Tips & Best Practices 1. **Schema Design**: - Make fields optional when data might not always be present. - Use descriptive field names and detailed descriptions. Use descriptions to pass formatting instructions or few-shot examples. - Start simple and iterate on schema complexity. + - Try to limit schema nesting to 3-4 levels. + - Make fields optional when data might not always be present. Having required fields may force the model + to hallucinate when these fields are not present in the documents. + - When you want to extract a variable number of entities, use an `array` type. Note that you cannot use + an `array` type for the root node. + - Use descriptive field names and detailed descriptions. Use descriptions to pass formatting + instructions or few-shot examples. + - Start simple and iteratively build your schema to incorporate requirements. 2. **Running Extractions**: - - Note that resetting `agent.schema` will not save the schema to the database, until you call `agent.save`, but it will be used for running extractions. - - Check job status prior to accessing results. Any extraction error should be available as part of `job.error` or `extraction_run.error` fields for debugging. + - Note that resetting `agent.schema` will not save the schema to the database + until you call `agent.save`, but it will be used for running extractions. + - Check job status prior to accessing results. Any extraction error should be available as + part of `job.error` or `extraction_run.error` fields for debugging. - Consider async operations (`queue_extraction`) for large-scale extraction once you have finalized your schema. ## Additional Resources