From 627a09cf0c9bcc2f824dd941310d03fc25bab46c Mon Sep 17 00:00:00 2001 From: Neeraj Pradhan Date: Fri, 17 Jan 2025 14:59:52 -0800 Subject: [PATCH] Add schema restrictions to the README (#44) --- README.md | 51 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 9775a4d..874123b 100644 --- a/README.md +++ b/README.md @@ -68,13 +68,23 @@ schema = { "name": {"type": "string", "description": "Candidate name"}, "experience": { "type": "array", + "description": "Work history", "items": { "type": "object", "properties": { - "company": {"type": "string"}, - "title": {"type": "string"}, - "start_date": {"type": "string"}, - "end_date": {"type": "string"}, + "company": { + "type": "string", + "description": "Company name", + }, + "title": {"type": "string", "description": "Job title"}, + "start_date": { + "anyOf": [{"type": "string"}, {"type": "null"}], + "description": "Start date of employment", + }, + "end_date": { + "anyOf": [{"type": "string"}, {"type": "null"}], + "description": "End date of employment", + }, }, }, }, @@ -84,6 +94,22 @@ schema = { agent = extractor.create_agent(name="resume-parser", data_schema=schema) ``` +### Important restrictions on JSON/Pydantic Schema + +*LlamaExtract only supports a subset of the JSON Schema specification.* While limited, it should +be sufficient for a wide variety of use-cases. + + - All fields are required by default. Nullable fields must be explicitly marked as such, + using `"anyOf"` with a `"null"` type. See `"start_date"` field above. + - Root node must be of type `"object"`. + - Schema nesting must be limited to 5 levels. + - The important fields are key names/titles, type and description. Fields for + formatting, default values, etc. are not supported. + - There are other restrictions on number of keys, size of the schema, etc. that you may + hit for complex extraction use cases. In such cases, it is worth considering how to restructure + your extraction workflow to fit within these constraints, e.g. by extracting a subset of fields + and later merging them together. + ## Other Extraction APIs ### Batch Processing @@ -137,13 +163,20 @@ pip install llama-extract==0.1.0 ## Tips & Best Practices 1. **Schema Design**: - Make fields optional when data might not always be present. - Use descriptive field names and detailed descriptions. Use descriptions to pass formatting instructions or few-shot examples. - Start simple and iterate on schema complexity. + - Try to limit schema nesting to 3-4 levels. + - Make fields optional when data might not always be present. Having required fields may force the model + to hallucinate when these fields are not present in the documents. + - When you want to extract a variable number of entities, use an `array` type. Note that you cannot use + an `array` type for the root node. + - Use descriptive field names and detailed descriptions. Use descriptions to pass formatting + instructions or few-shot examples. + - Start simple and iteratively build your schema to incorporate requirements. 2. **Running Extractions**: - - Note that resetting `agent.schema` will not save the schema to the database, until you call `agent.save`, but it will be used for running extractions. - - Check job status prior to accessing results. Any extraction error should be available as part of `job.error` or `extraction_run.error` fields for debugging. + - Note that resetting `agent.schema` will not save the schema to the database + until you call `agent.save`, but it will be used for running extractions. + - Check job status prior to accessing results. Any extraction error should be available as + part of `job.error` or `extraction_run.error` fields for debugging. - Consider async operations (`queue_extraction`) for large-scale extraction once you have finalized your schema. ## Additional Resources