diff --git a/.github/workflows/aws-genai-cicd-suite.yml b/.github/workflows/aws-genai-cicd-suite.yml
deleted file mode 100644
index b16c41b8..00000000
--- a/.github/workflows/aws-genai-cicd-suite.yml
+++ /dev/null
@@ -1,53 +0,0 @@
-name: Intelligent Code Review
-# Enable manual trigger
-on:
- workflow_dispatch:
- pull_request:
- types: [opened, synchronize]
-
- # Avoid running the same workflow on the same branch concurrently
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
-
-jobs:
- review:
- runs-on: ubuntu-latest
- environment: AWS_ROLE_TO_ASSUME
-
- permissions:
- # read repository contents and write pull request comments
- id-token: write
- # allow github action bot to push new content into existing pull requests
- contents: write
- # contents: read
- pull-requests: write
- steps:
- - name: Checkout code
- uses: actions/checkout@v3
-
- - name: Set up Node.js
- uses: actions/setup-node@v3
- with:
- node-version: '20'
-
- - name: Install dependencies @actions/core and @actions/github
- run: |
- npm install @actions/core
- npm install @actions/github
- shell: bash
-
- # check if required dependencies @actions/core and @actions/github are installed
- - name: Check if required dependencies are installed
- run: |
- npm list @actions/core
- npm list @actions/github
- shell: bash
-
- - name: Debug GitHub Token and environment variables
- run: |
- if [ -n "${{ secrets.GITHUB_TOKEN }}" ]; then
- echo "GitHub Token is set"
- else
- echo "GitHub Token is not set"
- fi
- echo "AWS_ROLE_TO_ASSUME: ${{ vars.AWS_ROLE_TO_ASSUME_VAR }}"
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
deleted file mode 100644
index e272a55a..00000000
--- a/.github/workflows/release.yml
+++ /dev/null
@@ -1,58 +0,0 @@
-name: release
-on:
- workflow_dispatch:
- inputs:
- reason:
- description: 'the reason for triggering this workflow'
- required: false
- default: 'manually publish the pre-built ecr images'
-jobs:
- ecr_images:
- runs-on: ubuntu-latest
- permissions:
- id-token: write
- contents: read
- env:
- iam_role_to_assume: ${{ secrets.ROLE_ARN }}
- steps:
- - name: Set up QEMU
- uses: docker/setup-qemu-action@v3
- - name: Set up Docker Buildx
- uses: docker/setup-buildx-action@v3
- - name: Checkout
- uses: actions/checkout@v4
- with:
- fetch-depth: 0
- - name: Configure AWS Credentials
- if: ${{ env.iam_role_to_assume != '' }}
- uses: aws-actions/configure-aws-credentials@v4
- with:
- role-to-assume: ${{ env.iam_role_to_assume }}
- aws-region: us-east-1
- - name: Build and Publish
- run: |-
- cd scripts
- bash push-to-ecr.sh
- cfn_templates:
- runs-on: ubuntu-latest
- permissions:
- id-token: write
- needs: ecr_images
- env:
- iam_role_to_assume: ${{ secrets.ROLE_ARN }}
- steps:
- - name: Checkout
- uses: actions/checkout@v4
- with:
- fetch-depth: 0
- - name: Configure AWS Credentials
- if: ${{ env.iam_role_to_assume != '' }}
- uses: aws-actions/configure-aws-credentials@v4
- with:
- role-to-assume: ${{ env.iam_role_to_assume }}
- aws-region: us-east-1
- - name: Copy Deployment Templates to S3
- env:
- S3_BUCKET: ${{ secrets.ASSET_BUCKET }}
- S3_PREFIX: bedrock-access-gateway/latest/
- run: aws s3 sync deployment/ s3://$S3_BUCKET/$S3_PREFIX --acl public-read
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index d8b355e7..23212b88 100644
--- a/.gitignore
+++ b/.gitignore
@@ -159,4 +159,5 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
-Config
\ No newline at end of file
+Config
+.vscode/launch.json
diff --git a/README.md b/README.md
index c82f70d2..306990ea 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,12 @@
-[中文](./README_CN.md)
-
# Bedrock Access Gateway
OpenAI-compatible RESTful APIs for Amazon Bedrock
## What's New 🔥
-This project supports reasoning for both **Claude 3.7 Sonnet** and **DeepSeek R1**, check [How to Use](./docs/Usage.md#reasoning) for more details. You need to first run the Models API to refresh the model list.
+This project now supports **Claude Sonnet 4.5**, Anthropic's most intelligent model, with enhanced coding capabilities and support for complex agents, available via global cross-region inference.
+
+It also supports reasoning for both **Claude 3.7 Sonnet** and **DeepSeek R1**. Check [How to Use](./docs/Usage.md#reasoning) for more details. You need to first run the Models API to refresh the model list.
## Overview
@@ -26,7 +26,10 @@ If you find this GitHub repository useful, please consider giving it a free star
- [x] Support Embedding API
- [x] Support Multimodal API
- [x] Support Cross-Region Inference
+- [x] Support Application Inference Profiles (**new**)
- [x] Support Reasoning (**new**)
+- [x] Support Interleaved Thinking (**new**)
+- [x] Support Prompt Caching (**new**)
Please check [Usage Guide](./docs/Usage.md) for more details about how to use the new APIs.
@@ -53,47 +56,75 @@ Alternatively, you can use Lambda Function URL to replace ALB, see [example](htt
### Deployment
-Please follow the steps below to deploy the Bedrock Proxy APIs into your AWS account. Only supports regions where Amazon Bedrock is available (such as `us-west-2`). The deployment will take approximately **3-5 minutes** 🕒.
+Please follow the steps below to deploy the Bedrock Proxy APIs into your AWS account. Only supports regions where Amazon Bedrock is available (such as `us-west-2`). The deployment will take approximately **10-15 minutes** 🕒.
**Step 1: Create your own API key in Secrets Manager (MUST)**
-
> **Note:** This step is to use any string (without spaces) you like to create a custom API Key (credential) that will be used to access the proxy API later. This key does not have to match your actual OpenAI key, and you don't need to have an OpenAI API key. please keep the key safe and private.
1. Open the AWS Management Console and navigate to the AWS Secrets Manager service.
-2. Click on "Store a new secret" button.
+2. Click the "Store a new secret" button.
3. In the "Choose secret type" page, select:
Secret type: Other type of secret
Key/value pairs:
- Key: api_key
- Value: Enter your API key value
-
+
Click "Next"
4. In the "Configure secret" page:
Secret name: Enter a name (e.g., "BedrockProxyAPIKey")
Description: (Optional) Add a description of your secret
5. Click "Next" and review all your settings and click "Store"
-After creation, you'll see your secret in the Secrets Manager console. Make note of the secret ARN.
+After creation, you'll see your secret in the Secrets Manager console. Make note of the secret ARN.
+
+**Step 2: Build and push container images to ECR**
+
+1. Clone this repository:
+ ```bash
+ git clone https://github.com/aws-samples/bedrock-access-gateway.git
+ cd bedrock-access-gateway
+ ```
+
+2. Run the build and push script:
+ ```bash
+ cd scripts
+ bash ./push-to-ecr.sh
+ ```
+
+3. Follow the prompts to configure:
+ - ECR repository names (or use defaults)
+ - Image tag (or use default: `latest`)
+ - AWS region (or use default: `us-east-1`)
+
+4. The script will build and push both Lambda and ECS/Fargate images to your ECR repositories.
+
+5. **Important**: Copy the image URIs displayed at the end of the script output (they look like `123456789012.dkr.ecr.us-east-1.amazonaws.com/bedrock-proxy-api:latest`, with your own account ID, region, repository name, and tag). You'll need these in the next step.
+**Step 3: Deploy the CloudFormation stack**
-**Step 2: Deploy the CloudFormation stack**
+1. Download the CloudFormation template you want to use:
+ - For Lambda: [`deployment/BedrockProxy.template`](deployment/BedrockProxy.template)
+ - For Fargate: [`deployment/BedrockProxyFargate.template`](deployment/BedrockProxyFargate.template)
-1. Sign in to AWS Management Console, switch to the region to deploy the CloudFormation Stack to.
-2. Click the following button to launch the CloudFormation Stack in that region. Choose one of the following:
+2. Sign in to AWS Management Console and navigate to the CloudFormation service in your target region.
- [
ALB + Lambda 1-Click Deploy 🚀
](https://console.aws.amazon.com/cloudformation/home?#/stacks/quickcreate?templateURL=https://aws-gcr-solutions.s3.amazonaws.com/bedrock-access-gateway/latest/BedrockProxy.template&stackName=BedrockProxyAPI)
+3. Click "Create stack" → "With new resources (standard)".
+
+4. Upload the template file you downloaded.
+
+5. On the "Specify stack details" page, provide the following information:
+ - **Stack name**: Enter a stack name (e.g., "BedrockProxyAPI")
+ - **ApiKeySecretArn**: Enter the secret ARN from Step 1
+ - **ContainerImageUri**: Enter the ECR image URI from Step 2 output
+ - **DefaultModelId**: (Optional) Change the default model if needed
- [
ALB + Fargate 1-Click Deploy 🚀
](https://console.aws.amazon.com/cloudformation/home?#/stacks/quickcreate?templateURL=https://aws-gcr-solutions.s3.amazonaws.com/bedrock-access-gateway/latest/BedrockProxyFargate.template&stackName=BedrockProxyAPI)
-3. Click "Next".
-4. On the "Specify stack details" page, provide the following information:
- - Stack name: Change the stack name if needed.
- - ApiKeySecretArn: Enter the secret ARN you used for storing the API key.
-
Click "Next".
-5. On the "Configure stack options" page, you can leave the default settings or customize them according to your needs. Click "Next".
-6. On the "Review" page, review the details of the stack you're about to create. Check the "I acknowledge that AWS CloudFormation might create IAM resources" checkbox at the bottom. Click "Create stack".
+
+6. On the "Configure stack options" page, you can leave the default settings or customize them according to your needs. Click "Next".
+
+7. On the "Review" page, review all details. Check the "I acknowledge that AWS CloudFormation might create IAM resources" checkbox at the bottom. Click "Submit".
That is it! 🎉 Once deployed, click the CloudFormation stack and go to **Outputs** tab, you can find the API Base URL from `APIBaseUrl`, the value should look like `http://xxxx.xxx.elb.amazonaws.com/api/v1`.
@@ -103,7 +134,7 @@ If you encounter any issues, please check the [Troubleshooting Guide](./docs/Tro
### SDK/API Usage
-All you need is the API Key and the API Base URL. If you didn't set up your own key, then the default API Key (`bedrock`) will be used.
+All you need is the API Key and the API Base URL. If you didn't set up your own key following Step 1, the application will fail to start with an error message indicating that the API Key is not configured.
Now, you can try out the proxy APIs. Let's say you want to test Claude 3 Sonnet model (model ID: `anthropic.claude-3-sonnet-20240229-v1:0`)...
@@ -148,7 +179,120 @@ print(completion.choices[0].message.content)
Please check [Usage Guide](./docs/Usage.md) for more details about how to use embedding API, multimodal API and tool call.
+### Application Inference Profiles
+
+This proxy now supports **Application Inference Profiles**, which let you track usage and costs for your model invocations. Pass an application inference profile created in your AWS account as the model ID to attribute usage for cost tracking and monitoring.
+
+**Using Application Inference Profiles:**
+
+```bash
+# Use an application inference profile ARN as the model ID
+curl $OPENAI_BASE_URL/chat/completions \
+ -H "Content-Type: application/json" \
+ -H "Authorization: Bearer $OPENAI_API_KEY" \
+ -d '{
+ "model": "arn:aws:bedrock:us-west-2:123456789012:application-inference-profile/your-profile-id",
+ "messages": [
+ {
+ "role": "user",
+ "content": "Hello!"
+ }
+ ]
+ }'
+```
+
+**SDK Usage with Application Inference Profiles:**
+```python
+from openai import OpenAI
+
+client = OpenAI()
+completion = client.chat.completions.create(
+ model="arn:aws:bedrock:us-west-2:123456789012:application-inference-profile/your-profile-id",
+ messages=[{"role": "user", "content": "Hello!"}],
+)
+
+print(completion.choices[0].message.content)
+```
+
+**Benefits of Application Inference Profiles:**
+- **Cost Tracking**: Track usage and costs for specific applications or use cases
+- **Usage Monitoring**: Monitor model invocation metrics through CloudWatch
+- **Tag-based Cost Allocation**: Use AWS cost allocation tags for detailed billing analysis
+
+For more information about creating and managing application inference profiles, see the [Amazon Bedrock User Guide](https://docs.aws.amazon.com/bedrock/latest/userguide/inference-profiles-create.html).
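+
+If you have not created an application inference profile yet, the snippet below is a minimal sketch of creating a tagged profile with boto3. It assumes the `bedrock` client's `create_inference_profile` API; the profile name, model ARN, region, and tag values are placeholders to replace with your own.
+
+```python
+import boto3
+
+bedrock = boto3.client("bedrock", region_name="us-west-2")
+
+# Create an application inference profile that copies an existing foundation
+# model (or cross-region inference profile) and tags it for cost allocation.
+# All names, ARNs, and tag values below are placeholders.
+response = bedrock.create_inference_profile(
+    inferenceProfileName="my-app-profile",
+    modelSource={
+        "copyFrom": "arn:aws:bedrock:us-west-2::foundation-model/anthropic.claude-3-7-sonnet-20250219-v1:0"
+    },
+    tags=[{"key": "project", "value": "my-project"}],
+)
+print(response["inferenceProfileArn"])
+```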
+
+### Prompt Caching
+
+This proxy now supports **Prompt Caching** for Claude and Nova models, which can reduce costs by up to 90% and latency by up to 85% for workloads with repeated prompts.
+
+**Supported Models:**
+- Claude 3+ models (Claude 3.5 Haiku, Claude 3.7 Sonnet, Claude 4, Claude 4.5, etc.)
+- Nova models (Nova Micro, Nova Lite, Nova Pro, Nova Premier)
+
+**Enabling Prompt Caching:**
+
+You can enable prompt caching in two ways:
+
+1. **Globally via Environment Variable** (set in ECS Task Definition or Lambda):
+```bash
+ENABLE_PROMPT_CACHING=true
+```
+
+2. **Per-request via `extra_body`**:
+
+**Python SDK:**
+```python
+from openai import OpenAI
+
+client = OpenAI()
+
+# Cache system prompts
+response = client.chat.completions.create(
+ model="us.anthropic.claude-3-7-sonnet-20250219-v1:0",
+ messages=[
+ {"role": "system", "content": "You are an expert assistant with knowledge of..."},
+ {"role": "user", "content": "Help me with this task"}
+ ],
+ extra_body={
+ "prompt_caching": {"system": True}
+ }
+)
+
+# Check cache hit
+if response.usage.prompt_tokens_details:
+ cached_tokens = response.usage.prompt_tokens_details.cached_tokens
+ print(f"Cached tokens: {cached_tokens}")
+```
+
+**cURL:**
+```bash
+curl $OPENAI_BASE_URL/chat/completions \
+ -H "Content-Type: application/json" \
+ -H "Authorization: Bearer $OPENAI_API_KEY" \
+ -d '{
+ "model": "us.anthropic.claude-3-7-sonnet-20250219-v1:0",
+ "messages": [
+ {"role": "system", "content": "Long system prompt..."},
+ {"role": "user", "content": "Question"}
+ ],
+ "extra_body": {
+ "prompt_caching": {"system": true}
+ }
+ }'
+```
+
+**Cache Options:**
+- `"prompt_caching": {"system": true}` - Cache system prompts
+- `"prompt_caching": {"messages": true}` - Cache user messages
+- `"prompt_caching": {"system": true, "messages": true}` - Cache both
+
+**Requirements:**
+- Prompt must be ≥1,024 tokens to enable caching
+- Cache TTL is 5 minutes (resets on each cache hit)
+- Nova models have a 20,000 token caching limit
+
+For more information, see the [Amazon Bedrock Prompt Caching Guide](https://docs.aws.amazon.com/bedrock/latest/userguide/prompt-caching.html).
## Other Examples
@@ -204,12 +348,6 @@ Note that not all models are available in those regions.
You can use the [Models API](./docs/Usage.md#models-api) to get/refresh a list of supported models in the current region.
-### Can I build and use my own ECR image
-
-Yes, you can clone the repo and build the container image by yourself (`src/Dockerfile`) and then push to your ECR repo. You can use `scripts/push-to-ecr.sh`
-
-Replace the repo url in the CloudFormation template before you deploy.
-
### Can I run this locally
Yes, you can run this locally, e.g. run below command under `src` folder:
@@ -236,13 +374,7 @@ Fine-tuned models and models with Provisioned Throughput are currently not suppo
### How to upgrade?
-To use the latest features, you don't need to redeploy the CloudFormation stack. You simply need to pull the latest image.
-
-To do so, depends on which version you deployed:
-
-- **Lambda version**: Go to AWS Lambda console, find the Lambda function, then find and click the `Deploy new image` button and click save.
-- **Fargate version**: Go to ECS console, click the ECS cluster, go the `Tasks` tab, select the only task that is running and simply click `Stop selected` menu. A new task with latest image will start automatically.
-
+To use the latest features, you need to follow the deployment guide to redeploy the application: build and push a new image, then update the existing CloudFormation stack (for example, by updating the `ContainerImageUri` parameter) to pick up the latest changes.
## Security
diff --git a/README_CN.md b/README_CN.md
deleted file mode 100644
index 8afe1a80..00000000
--- a/README_CN.md
+++ /dev/null
@@ -1,252 +0,0 @@
-[English](./README.md)
-
-# Bedrock Access Gateway
-
-使用兼容OpenAI的API访问Amazon Bedrock
-
-## 新功能 🔥
-
-本项目支持 **Claude 3.7 Sonnet** 和 **DeepSeek R1** 的推理(Reasoning)功能,查看[使用方法](./docs/Usage_CN.md#reasoning) 获取更多详情。首次使用你需要先运行下Models API 来刷新model列表。
-
-
-## 概述
-
-Amazon Bedrock提供了广泛的基础模型(如Claude 3 Opus/Sonnet/Haiku、Llama 2/3、Mistral/Mixtral等),以及构建生成式AI应用程序的多种功能。更多详细信息,请查看[Amazon
-Bedrock](https://aws.amazon.com/bedrock)。
-
-有时,您可能已经使用OpenAI的API或SDK构建了应用程序,并希望在不修改代码的情况下试用Amazon
-Bedrock的模型。或者,您可能只是希望在AutoGen等工具中评估这些基础模型的功能。 好消息是, 这里提供了一种方便的途径,让您可以使用
-OpenAI 的 API 或 SDK 无缝集成并试用 Amazon Bedrock 的模型,而无需对现有代码进行修改。
-
-如果您觉得这个项目有用,请考虑给它点个一个免费的小星星 ⭐。
-
-功能列表:
-
-- [x] 支持 server-sent events (SSE)的流式响应
-- [x] 支持 Model APIs
-- [x] 支持 Chat Completion APIs
-- [x] 支持 Tool Call
-- [x] 支持 Embedding API
-- [x] 支持 Multimodal API
-- [x] 支持 Cross-Region Inference
-- [x] 支持 Reasoning Mode (**new**)
-
-请查看[使用指南](./docs/Usage_CN.md)以获取有关如何使用新API的更多详细信息。
-
-## 使用指南
-
-### 前提条件
-
-请确保您已满足以下先决条件:
-
-- 可以访问Amazon Bedrock基础模型。
-
-如果您还没有获得模型访问权限,请参考[配置](https://docs.aws.amazon.com/bedrock/latest/userguide/model-access.html)指南。
-
-### 架构图
-
-下图展示了本方案的参考架构。请注意,它还包括一个新的**VPC**,其中只有两个公共子网用于应用程序负载均衡器(ALB)。
-
-
-
-您也可以选择在 ALB 后面接 [AWS Fargate](https://aws.amazon.com/fargate/) 而不是 [AWS Lambda](https://aws.amazon.com/lambda/),主要区别在于流响应的首字节延迟(Fargate更低)。
-
-或者,您可以使用 Lambda Function URL 来代替 ALB,请参阅[示例](https://github.com/awslabs/aws-lambda-web-adapter/tree/main/examples/fastapi-response-streaming)
-
-### 部署
-
-请按以下步骤将Bedrock代理API部署到您的AWS账户中。仅支持Amazon Bedrock可用的区域(如us-west-2)。 部署预计用时**3-5分钟** 🕒。
-
-**第一步: 在 Secrets Manager 中创建您的 API 密钥(必须)**
-
-> 注意:这一步是使用任意字符串(不带空格)创建一个自定义的API Key(凭证),将用于后续访问代理API。此API Key不必与您实际的OpenAI
-> Key一致,您甚至无需拥有OpenAI API Key。请确保保管好此API Key。
-
-1. 打开 AWS 管理控制台并导航至 AWS Secrets Manager 服务。
-2. 点击 "存储新密钥" 按钮。
-3. 在 "选择密钥类型" 页面,选择:
-
- 密钥类型:其他类型的密钥 键/值对:
-
- - 键:api_key
- - 值:输入您的 API 密钥值
- 点击 "下一步"
-4. 在 "配置密钥" 页面: 密钥名称:输入一个名称(例如:"BedrockProxyAPIKey") 描述:(可选)添加密钥的描述
-5. 点击 "下一步",检查所有设置后点击 "存储"
-
-创建完成后,您将在 Secrets Manager 控制台中看到您的密钥。请记下密钥的 ARN。
-
-**第二步: 部署CloudFormation堆栈**
-
-1. 登录AWS管理控制台,切换到要部署CloudFormation堆栈的区域。
-2. 单击以下按钮在该区域启动CloudFormation堆栈,选择一种方式部署。
- - **ALB + Lambda**
-
- [](https://console.aws.amazon.com/cloudformation/home#/stacks/create/template?stackName=BedrockProxyAPI&templateURL=https://aws-gcr-solutions.s3.amazonaws.com/bedrock-access-gateway/latest/BedrockProxy.template)
-
- - **ALB + Fargate**
-
- [](https://console.aws.amazon.com/cloudformation/home#/stacks/create/template?stackName=BedrockProxyAPI&templateURL=https://aws-gcr-solutions.s3.amazonaws.com/bedrock-access-gateway/latest/BedrockProxyFargate.template)
-3. 单击"下一步"。
-4. 在"指定堆栈详细信息"页面,提供以下信息:
- - 堆栈名称: 可以根据需要更改名称。
- - ApiKeySecretArn:输入您用于存储API 密钥的ARN。
-
- 单击"下一步"。
-5. 在"配置堆栈选项"页面,您可以保留默认设置或根据需要进行自定义。 单击"下一步"。
-6. 在"审核"页面,查看您即将创建的堆栈详细信息。勾选底部的"我确认,AWS CloudFormation 可能创建 IAM 资源。"复选框。 单击"创建堆栈"。
-
-仅此而已 🎉 。部署完成后,点击CloudFormation堆栈,进入"输出"选项卡,你可以从"APIBaseUrl"
-中找到API Base URL,它应该类似于`http://xxxx.xxx.elb.amazonaws.com/api/v1` 这样的格式。
-
-### SDK/API使用
-
-你只需要API Key和API Base URL。如果你没有设置自己的密钥,那么默认将使用API Key `bedrock`。
-
-现在,你可以尝试使用代理API了。假设你想测试Claude 3 Sonnet模型,那么使用"anthropic.claude-3-sonnet-20240229-v1:0"作为模型ID。
-
-- **API 使用示例**
-
-```bash
-export OPENAI_API_KEY=
-export OPENAI_BASE_URL=
-# 旧版本请使用OPENAI_API_BASE
-# https://github.com/openai/openai-python/issues/624
-export OPENAI_API_BASE=
-```
-
-```bash
-curl $OPENAI_BASE_URL/chat/completions \
- -H "Content-Type: application/json" \
- -H "Authorization: Bearer $OPENAI_API_KEY" \
- -d '{
- "model": "anthropic.claude-3-sonnet-20240229-v1:0",
- "messages": [
- {
- "role": "user",
- "content": "Hello!"
- }
- ]
- }'
-```
-
-- **SDK 使用示例**
-
-```python
-from openai import OpenAI
-
-client = OpenAI()
-completion = client.chat.completions.create(
- model="anthropic.claude-3-sonnet-20240229-v1:0",
- messages=[{"role": "user", "content": "Hello!"}],
-)
-
-print(completion.choices[0].message.content)
-```
-
-请查看[使用指南](./docs/Usage_CN.md)以获取有关如何使用Embedding API、多模态API和Tool Call的更多详细信息。
-
-
-
-## 其他例子
-
-### LangChain
-
-请确保使用的示`ChatOpenAI(...)` ,而不是`OpenAI(...)`
-
-```python
-# pip install langchain-openai
-import os
-
-from langchain.chains import LLMChain
-from langchain.prompts import PromptTemplate
-from langchain_openai import ChatOpenAI
-
-chat = ChatOpenAI(
- model="anthropic.claude-3-sonnet-20240229-v1:0",
- temperature=0,
- openai_api_key=os.environ['OPENAI_API_KEY'],
- openai_api_base=os.environ['OPENAI_BASE_URL'],
-)
-
-template = """Question: {question}
-
-Answer: Let's think step by step."""
-
-prompt = PromptTemplate.from_template(template)
-llm_chain = LLMChain(prompt=prompt, llm=chat)
-
-question = "What NFL team won the Super Bowl in the year Justin Beiber was born?"
-response = llm_chain.invoke(question)
-print(response)
-
-```
-
-## FAQs
-
-### 关于隐私
-
-这个方案不会收集您的任何数据。而且,它默认情况下也不会记录任何请求或响应。
-
-### 为什么没有使用API Gateway 而是使用了Application Load Balancer?
-
-简单的答案是API Gateway不支持 server-sent events (SSE) 用于流式响应。
-
-### 支持哪些区域?
-
-通常来说,所有Amazon Bedrock支持的区域都支持,如果不支持,请提个Github Issue。
-
-注意,并非所有模型都在上面区可用。
-
-### 支持哪些模型?
-
-你可以通过[Model API](./docs/Usage_CN.md#models-api) 获取(或更新)当前区支持的模型列表。
-
-### 我可以构建并使用自己的ECR镜像吗?
-
-是的,你可以克隆repo并自行构建容器镜像(src/Dockerfile),然后推送到你自己的ECR仓库。 脚本可以参考`scripts/push-to-ecr.sh`。
-
-在部署之前,请在CloudFormation模板中替换镜像仓库URL。
-
-### 我可以在本地运行吗?
-
-是的,你可以在本地运行, 例如在`src` 文件夹下运行:
-
-```bash
-uvicorn api.app:app --host 0.0.0.0 --port 8000
-```
-
-那么API Base URL应该类似于`http://localhost:8000/api/v1`
-
-### 使用代理API会有任何性能牺牲或延迟增加吗?
-
-与 AWS SDK 调用相比,本方案参考架构会在响应上会有额外的延迟,你可以自己部署并测试。
-
-另外,你也可以使用 Lambda Web Adapter + Function URL (
-参见 [示例](https://github.com/awslabs/aws-lambda-web-adapter/tree/main/examples/fastapi-response-streaming))来代替 ALB
-或使用 AWS Fargate 来代替 Lambda,以获得更好的流响应性能。
-
-### 有计划支持SageMaker模型吗?
-
-目前没有支持SageMaker模型的计划。这取决于是否有客户需求。
-
-### 有计划支持Bedrock自定义模型吗?
-
-不支持微调模型和设置了已预配吞吐量的模型。如有需要,你可以克隆repo并进行自定义。
-
-### 如何升级?
-
-要使用最新功能,您无需重新部署CloudFormation堆栈。您只需拉取最新的镜像即可。
-
-具体操作方式取决于您部署的版本:
-
-- **Lambda版本**: 进入AWS Lambda控制台,找到Lambda 函数,然后找到并单击`部署新映像`按钮,然后单击保存。
-- **Fargate版本**: 进入ECS控制台,单击ECS集群,转到`任务`选项卡,选择正在运行的唯一任务,然后点击`停止所选`菜单, ECS会自动启动新任务并且使用最新镜像。
-
-## 安全
-
-更多信息,请参阅[CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications)。
-
-## 许可证
-
-本项目根据MIT-0许可证获得许可。请参阅LICENSE文件。
diff --git a/THIRD_PARTY b/THIRD_PARTY
new file mode 100644
index 00000000..702c9506
--- /dev/null
+++ b/THIRD_PARTY
@@ -0,0 +1,8 @@
+certifi
+
+SPDX-License-Identifier: MPL-2.0
+This Source Code Form is subject to the terms of the Mozilla Public
+License, v. 2.0. If a copy of the MPL was not distributed with this
+file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+https://github.com/certifi/python-certifi
\ No newline at end of file
diff --git a/assets/launch-stack.png b/assets/launch-stack.png
deleted file mode 100644
index 2745adf4..00000000
Binary files a/assets/launch-stack.png and /dev/null differ
diff --git a/deployment/BedrockProxy.template b/deployment/BedrockProxy.template
index 17387dfb..1b15de43 100644
--- a/deployment/BedrockProxy.template
+++ b/deployment/BedrockProxy.template
@@ -4,10 +4,20 @@ Parameters:
Type: String
AllowedPattern: ^arn:aws:secretsmanager:.*$
Description: The secret ARN in Secrets Manager used to store the API Key
+ ContainerImageUri:
+ Type: String
+ Description: The ECR image URI for the Lambda function (e.g., 123456789012.dkr.ecr.us-east-1.amazonaws.com/bedrock-proxy-api:latest)
DefaultModelId:
Type: String
Default: anthropic.claude-3-sonnet-20240229-v1:0
Description: The default model ID, please make sure the model ID is supported in the current region
+ EnablePromptCaching:
+ Type: String
+ Default: "false"
+ AllowedValues:
+ - "true"
+ - "false"
+ Description: Enable prompt caching for supported models (Claude, Nova). When enabled, adds cachePoint to system prompts and messages for cost savings.
Resources:
VPCB9E5F0B4:
Type: AWS::EC2::VPC
@@ -151,6 +161,7 @@ Resources:
Resource:
- arn:aws:bedrock:*::foundation-model/*
- arn:aws:bedrock:*:*:inference-profile/*
+ - arn:aws:bedrock:*:*:application-inference-profile/*
- Action:
- secretsmanager:GetSecretValue
- secretsmanager:DescribeSecret
@@ -168,13 +179,7 @@ Resources:
- arm64
Code:
ImageUri:
- Fn::Join:
- - ""
- - - 366590864501.dkr.ecr.
- - Ref: AWS::Region
- - "."
- - Ref: AWS::URLSuffix
- - /bedrock-proxy-api:latest
+ Ref: ContainerImageUri
Description: Bedrock Proxy API Handler
Environment:
Variables:
@@ -185,6 +190,9 @@ Resources:
Ref: DefaultModelId
DEFAULT_EMBEDDING_MODEL: cohere.embed-multilingual-v3
ENABLE_CROSS_REGION_INFERENCE: "true"
+ ENABLE_APPLICATION_INFERENCE_PROFILES: "true"
+ ENABLE_PROMPT_CACHING:
+ Ref: EnablePromptCaching
MemorySize: 1024
PackageType: Image
Role:
diff --git a/deployment/BedrockProxyFargate.template b/deployment/BedrockProxyFargate.template
index bae785cc..4fee3edf 100644
--- a/deployment/BedrockProxyFargate.template
+++ b/deployment/BedrockProxyFargate.template
@@ -4,10 +4,20 @@ Parameters:
Type: String
AllowedPattern: ^arn:aws:secretsmanager:.*$
Description: The secret ARN in Secrets Manager used to store the API Key
+ ContainerImageUri:
+ Type: String
+ Description: The ECR image URI for the ECS/Fargate task (e.g., 123456789012.dkr.ecr.us-east-1.amazonaws.com/bedrock-proxy-api-ecs:latest)
DefaultModelId:
Type: String
Default: anthropic.claude-3-sonnet-20240229-v1:0
Description: The default model ID, please make sure the model ID is supported in the current region
+ EnablePromptCaching:
+ Type: String
+ Default: "false"
+ AllowedValues:
+ - "true"
+ - "false"
+ Description: Enable prompt caching for supported models (Claude, Nova). When enabled, adds cachePoint to system prompts and messages for cost savings.
Resources:
VPCB9E5F0B4:
Type: AWS::EC2::VPC
@@ -134,10 +144,6 @@ Resources:
PolicyDocument:
Statement:
- Action:
- - ecr:GetAuthorizationToken
- - ecr:BatchCheckLayerAvailability
- - ecr:GetDownloadUrlForLayer
- - ecr:BatchGetImage
- logs:CreateLogStream
- logs:PutLogEvents
Effect: Allow
@@ -157,8 +163,35 @@ Resources:
Fn::Join:
- ""
- - "arn:aws:ecr:"
- - Ref: AWS::Region
- - :366590864501:repository/bedrock-proxy-api-ecs
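+            # Derive the ECR repository ARN (region, account ID, repository name) by parsing ContainerImageUri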
+ - Fn::Select:
+ - 3
+ - Fn::Split:
+ - "."
+ - Fn::Select:
+ - 0
+ - Fn::Split:
+ - "/"
+ - Ref: ContainerImageUri
+ - ":"
+ - Fn::Select:
+ - 0
+ - Fn::Split:
+ - "."
+ - Fn::Select:
+ - 0
+ - Fn::Split:
+ - "/"
+ - Ref: ContainerImageUri
+ - ":repository/"
+ - Fn::Select:
+ - 0
+ - Fn::Split:
+ - ":"
+ - Fn::Select:
+ - 1
+ - Fn::Split:
+ - "/"
+ - Ref: ContainerImageUri
- Action: ecr:GetAuthorizationToken
Effect: Allow
Resource: "*"
@@ -193,6 +226,7 @@ Resources:
Resource:
- arn:aws:bedrock:*::foundation-model/*
- arn:aws:bedrock:*:*:inference-profile/*
+ - arn:aws:bedrock:*:*:application-inference-profile/*
Version: "2012-10-17"
PolicyName: ProxyTaskRoleDefaultPolicy933321B8
Roles:
@@ -222,19 +256,18 @@ Resources:
Value: cohere.embed-multilingual-v3
- Name: ENABLE_CROSS_REGION_INFERENCE
Value: "true"
+ - Name: ENABLE_APPLICATION_INFERENCE_PROFILES
+ Value: "true"
+ - Name: ENABLE_PROMPT_CACHING
+ Value:
+ Ref: EnablePromptCaching
Essential: true
Image:
- Fn::Join:
- - ""
- - - 366590864501.dkr.ecr.
- - Ref: AWS::Region
- - "."
- - Ref: AWS::URLSuffix
- - /bedrock-proxy-api-ecs:latest
+ Ref: ContainerImageUri
Name: proxy-api
PortMappings:
- - ContainerPort: 80
- HostPort: 80
+ - ContainerPort: 8080
+ HostPort: 8080
Protocol: tcp
Secrets:
- Name: API_KEY
@@ -280,7 +313,7 @@ Resources:
HealthCheckGracePeriodSeconds: 60
LoadBalancers:
- ContainerName: proxy-api
- ContainerPort: 80
+ ContainerPort: 8080
TargetGroupArn:
Ref: ProxyALBListenerTargetsGroup187739FA
NetworkConfiguration:
@@ -317,7 +350,7 @@ Resources:
Type: AWS::EC2::SecurityGroupIngress
Properties:
Description: Load balancer to target
- FromPort: 80
+ FromPort: 8080
GroupId:
Fn::GetAtt:
- ProxyApiServiceSecurityGroup51EBD9B8
@@ -327,7 +360,7 @@ Resources:
Fn::GetAtt:
- ProxyALBSecurityGroup0D6CA3DA
- GroupId
- ToPort: 80
+ ToPort: 8080
DependsOn:
- ProxyTaskRoleDefaultPolicy933321B8
- ProxyTaskRole5DB6A540
@@ -373,13 +406,13 @@ Resources:
Fn::GetAtt:
- ProxyApiServiceSecurityGroup51EBD9B8
- GroupId
- FromPort: 80
+ FromPort: 8080
GroupId:
Fn::GetAtt:
- ProxyALBSecurityGroup0D6CA3DA
- GroupId
IpProtocol: tcp
- ToPort: 80
+ ToPort: 8080
ProxyALBListener933E9515:
Type: AWS::ElasticLoadBalancingV2::Listener
Properties:
@@ -398,7 +431,7 @@ Resources:
HealthCheckIntervalSeconds: 60
HealthCheckPath: /health
HealthCheckTimeoutSeconds: 30
- Port: 80
+ Port: 8080
Protocol: HTTP
TargetGroupAttributes:
- Key: stickiness.enabled
diff --git a/docs/Usage.md b/docs/Usage.md
index 822a870c..c9d003cd 100644
--- a/docs/Usage.md
+++ b/docs/Usage.md
@@ -15,6 +15,7 @@ export OPENAI_BASE_URL=
- [Multimodal API](#multimodal-api)
- [Tool Call](#tool-call)
- [Reasoning](#reasoning)
+- [Interleaved thinking (beta)](#interleaved-thinking-beta)
## Models API
@@ -50,6 +51,43 @@ curl -s $OPENAI_BASE_URL/models -H "Authorization: Bearer $OPENAI_API_KEY" | jq
]
```
+## Chat Completions API
+
+### Basic Example with Claude Sonnet 4.5
+
+Claude Sonnet 4.5 is Anthropic's most intelligent model, excelling at coding, complex reasoning, and agent-based tasks. It's available via global cross-region inference profiles.
+
+**Example Request**
+
+```bash
+curl $OPENAI_BASE_URL/chat/completions \
+ -H "Content-Type: application/json" \
+ -H "Authorization: Bearer $OPENAI_API_KEY" \
+ -d '{
+ "model": "global.anthropic.claude-sonnet-4-5-20250929-v1:0",
+ "messages": [
+ {
+ "role": "user",
+ "content": "Write a Python function to calculate the Fibonacci sequence using dynamic programming."
+ }
+ ]
+ }'
+```
+
+**Example SDK Usage**
+
+```python
+from openai import OpenAI
+
+client = OpenAI()
+completion = client.chat.completions.create(
+ model="global.anthropic.claude-sonnet-4-5-20250929-v1:0",
+ messages=[{"role": "user", "content": "Write a Python function to calculate the Fibonacci sequence using dynamic programming."}],
+)
+
+print(completion.choices[0].message.content)
+```
+
## Embedding API
**Important Notice**: Please carefully review the following points before using this proxy API for embedding.
@@ -441,4 +479,97 @@ for chunk in response:
reasoning_content += chunk.choices[0].delta.reasoning_content
elif chunk.choices[0].delta.content:
content += chunk.choices[0].delta.content
-```
\ No newline at end of file
+```
+
+## Interleaved thinking (beta)
+
+**Important Notice**: Please carefully review the following points before using interleaved thinking with the Chat Completions API.
+
+Extended thinking with tool use in Claude 4 models supports [interleaved thinking](https://docs.aws.amazon.com/bedrock/latest/userguide/claude-messages-extended-thinking.html#claude-messages-extended-thinking-tool-use-interleaved), which enables Claude 4 models to think between tool calls and run more sophisticated reasoning after receiving tool results. This is helpful for more complex agentic interactions.
+With interleaved thinking, `budget_tokens` can exceed the `max_tokens` parameter because it represents the total budget across all thinking blocks within one assistant turn.
+
+**Supported Models**: Claude Sonnet 4, Claude Sonnet 4.5
+
+**Example Request**
+
+- Non-Streaming (Claude Sonnet 4.5)
+
+```bash
+curl http://127.0.0.1:8000/api/v1/chat/completions \
+-H "Content-Type: application/json" \
+-H "Authorization: Bearer bedrock" \
+-d '{
+"model": "global.anthropic.claude-sonnet-4-5-20250929-v1:0",
+"max_tokens": 2048,
+"messages": [{
+"role": "user",
+"content": "Explain how to implement a binary search tree with self-balancing capabilities."
+}],
+"extra_body": {
+"anthropic_beta": ["interleaved-thinking-2025-05-14"],
+"thinking": {"type": "enabled", "budget_tokens": 4096}
+}
+}'
+```
+
+- Non-Streaming (Claude Sonnet 4)
+
+```bash
+curl http://127.0.0.1:8000/api/v1/chat/completions \
+-H "Content-Type: application/json" \
+-H "Authorization: Bearer bedrock" \
+-d '{
+"model": "us.anthropic.claude-sonnet-4-20250514-v1:0",
+"max_tokens": 2048,
+"messages": [{
+"role": "user",
+"content": "有一天,一个女孩参加数学考试只得了 38 分。她心里对父亲的惩罚充满恐惧,于是偷偷把分数改成了 88 分。她的父亲看到试卷后,怒发冲冠,狠狠地给了她一巴掌,怒吼道:“你这 8 怎么一半是绿的一半是红的,你以为我是傻子吗?”女孩被打后,委屈地哭了起来,什么也没说。过了一会儿,父亲突然崩溃了。请问这位父亲为什么过一会崩溃了?"
+}],
+"extra_body": {
+"anthropic_beta": ["interleaved-thinking-2025-05-14"],
+"thinking": {"type": "enabled", "budget_tokens": 4096}
+}
+}'
+```
+
+- Streaming (Claude Sonnet 4.5)
+
+```bash
+curl http://127.0.0.1:8000/api/v1/chat/completions \
+-H "Content-Type: application/json" \
+-H "Authorization: Bearer bedrock" \
+-d '{
+"model": "global.anthropic.claude-sonnet-4-5-20250929-v1:0",
+"max_tokens": 2048,
+"messages": [{
+"role": "user",
+"content": "Explain how to implement a binary search tree with self-balancing capabilities."
+}],
+"stream": true,
+"extra_body": {
+"anthropic_beta": ["interleaved-thinking-2025-05-14"],
+"thinking": {"type": "enabled", "budget_tokens": 4096}
+}
+}'
+```
+
+- Streaming (Claude Sonnet 4)
+
+```bash
+curl http://127.0.0.1:8000/api/v1/chat/completions \
+-H "Content-Type: application/json" \
+-H "Authorization: Bearer bedrock" \
+-d '{
+"model": "us.anthropic.claude-sonnet-4-20250514-v1:0",
+"max_tokens": 2048,
+"messages": [{
+"role": "user",
+"content": "有一天,一个女孩参加数学考试只得了 38 分。她心里对父亲的惩罚充满恐惧,于是偷偷把分数改成了 88 分。她的父亲看到试卷后,怒发冲冠,狠狠地给了她一巴掌,怒吼道:“你这 8 怎么一半是绿的一半是红的,你以为我是傻子吗?”女孩被打后,委屈地哭了起来,什么也没说。过了一会儿,父亲突然崩溃了。请问这位父亲为什么过一会崩溃了?"
+}],
+"stream": true,
+"extra_body": {
+"anthropic_beta": ["interleaved-thinking-2025-05-14"],
+"thinking": {"type": "enabled", "budget_tokens": 4096}
+}
+}'
+```
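+
+The same request can be sent through the OpenAI Python SDK by forwarding the Bedrock-specific fields via `extra_body`. Below is a minimal sketch mirroring the non-streaming Claude Sonnet 4.5 example above; the base URL and API key are the same placeholder values used in the curl examples:
+
+```python
+from openai import OpenAI
+
+# Point the client at the proxy, using the same values as the curl examples above
+client = OpenAI(base_url="http://127.0.0.1:8000/api/v1", api_key="bedrock")
+
+response = client.chat.completions.create(
+    model="global.anthropic.claude-sonnet-4-5-20250929-v1:0",
+    max_tokens=2048,
+    messages=[
+        {
+            "role": "user",
+            "content": "Explain how to implement a binary search tree with self-balancing capabilities.",
+        }
+    ],
+    # Forward the interleaved thinking fields shown in the curl examples
+    extra_body={
+        "anthropic_beta": ["interleaved-thinking-2025-05-14"],
+        "thinking": {"type": "enabled", "budget_tokens": 4096},
+    },
+)
+
+print(response.choices[0].message.content)
+```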
diff --git a/docs/Usage_CN.md b/docs/Usage_CN.md
index c541e195..985f51bc 100644
--- a/docs/Usage_CN.md
+++ b/docs/Usage_CN.md
@@ -15,6 +15,8 @@ export OPENAI_BASE_URL=
- [Multimodal API](#multimodal-api)
- [Tool Call](#tool-call)
- [Reasoning](#reasoning)
+- [Interleaved thinking (beta)](#interleaved-thinking-beta)
+
## Models API
@@ -47,6 +49,42 @@ curl -s $OPENAI_BASE_URL/models -H "Authorization: Bearer $OPENAI_API_KEY" | jq
]
```
+## Chat Completions API
+
+### Claude Sonnet 4.5 基础示例
+
+Claude Sonnet 4.5 是 Anthropic 最智能的模型,在编码、复杂推理和基于代理的任务方面表现出色。它通过全球跨区域推理配置文件提供。
+
+**Request 示例**
+
+```bash
+curl $OPENAI_BASE_URL/chat/completions \
+ -H "Content-Type: application/json" \
+ -H "Authorization: Bearer $OPENAI_API_KEY" \
+ -d '{
+ "model": "global.anthropic.claude-sonnet-4-5-20250929-v1:0",
+ "messages": [
+ {
+ "role": "user",
+ "content": "编写一个使用动态规划计算斐波那契数列的Python函数。"
+ }
+ ]
+ }'
+```
+
+**SDK 使用示例**
+
+```python
+from openai import OpenAI
+
+client = OpenAI()
+completion = client.chat.completions.create(
+ model="global.anthropic.claude-sonnet-4-5-20250929-v1:0",
+ messages=[{"role": "user", "content": "编写一个使用动态规划计算斐波那契数列的Python函数。"}],
+)
+
+print(completion.choices[0].message.content)
+```
## Embedding API
@@ -440,4 +478,98 @@ for chunk in response:
reasoning_content += chunk.choices[0].delta.reasoning_content
elif chunk.choices[0].delta.content:
content += chunk.choices[0].delta.content
-```
\ No newline at end of file
+```
+
+## Interleaved thinking (beta)
+
+**重要提示**:在通过 Chat Completions API 使用交错思考(interleaved thinking)前,请务必仔细阅读以下内容。
+
+Claude 4 模型支持借助工具使用的扩展思维功能(Extended Thinking),其中包含交错思考([interleaved thinking](https://docs.aws.amazon.com/bedrock/latest/userguide/claude-messages-extended-thinking.html#claude-messages-extended-thinking-tool-use-interleaved) )。该功能使 Claude 4 可以在多次调用工具之间进行思考,并在收到工具结果后执行更复杂的推理,这对处理更复杂的 Agentic AI 交互非常有帮助。
+
+在交错思考模式下,budget_tokens 可以超过 max_tokens 参数,因为它代表一次助手回合中所有思考块的总 Token 预算。
+
+**支持的模型**: Claude Sonnet 4, Claude Sonnet 4.5
+
+**Request 示例**
+
+- Non-Streaming (Claude Sonnet 4.5)
+
+```bash
+curl http://127.0.0.1:8000/api/v1/chat/completions \
+-H "Content-Type: application/json" \
+-H "Authorization: Bearer bedrock" \
+-d '{
+"model": "global.anthropic.claude-sonnet-4-5-20250929-v1:0",
+"max_tokens": 2048,
+"messages": [{
+"role": "user",
+"content": "解释如何实现一个具有自平衡功能的二叉搜索树。"
+}],
+"extra_body": {
+"anthropic_beta": ["interleaved-thinking-2025-05-14"],
+"thinking": {"type": "enabled", "budget_tokens": 4096}
+}
+}'
+```
+
+- Non-Streaming (Claude Sonnet 4)
+
+```bash
+curl http://127.0.0.1:8000/api/v1/chat/completions \
+-H "Content-Type: application/json" \
+-H "Authorization: Bearer bedrock" \
+-d '{
+"model": "us.anthropic.claude-sonnet-4-20250514-v1:0",
+"max_tokens": 2048,
+"messages": [{
+"role": "user",
+"content": "有一天,一个女孩参加数学考试只得了 38 分。她心里对父亲的惩罚充满恐惧,于是偷偷把分数改成了 88 分。她的父亲看到试卷后,怒发冲冠,狠狠地给了她一巴掌,怒吼道:“你这 8 怎么一半是绿的一半是红的,你以为我是傻子吗?”女孩被打后,委屈地哭了起来,什么也没说。过了一会儿,父亲突然崩溃了。请问这位父亲为什么过一会崩溃了?"
+}],
+"extra_body": {
+"anthropic_beta": ["interleaved-thinking-2025-05-14"],
+"thinking": {"type": "enabled", "budget_tokens": 4096}
+}
+}'
+```
+
+- Streaming (Claude Sonnet 4.5)
+
+```bash
+curl http://127.0.0.1:8000/api/v1/chat/completions \
+-H "Content-Type: application/json" \
+-H "Authorization: Bearer bedrock" \
+-d '{
+"model": "global.anthropic.claude-sonnet-4-5-20250929-v1:0",
+"max_tokens": 2048,
+"messages": [{
+"role": "user",
+"content": "解释如何实现一个具有自平衡功能的二叉搜索树。"
+}],
+"stream": true,
+"extra_body": {
+"anthropic_beta": ["interleaved-thinking-2025-05-14"],
+"thinking": {"type": "enabled", "budget_tokens": 4096}
+}
+}'
+```
+
+- Streaming (Claude Sonnet 4)
+
+```bash
+curl http://127.0.0.1:8000/api/v1/chat/completions \
+-H "Content-Type: application/json" \
+-H "Authorization: Bearer bedrock" \
+-d '{
+"model": "us.anthropic.claude-sonnet-4-20250514-v1:0",
+"max_tokens": 2048,
+"messages": [{
+"role": "user",
+"content": "有一天,一个女孩参加数学考试只得了 38 分。她心里对父亲的惩罚充满恐惧,于是偷偷把分数改成了 88 分。她的父亲看到试卷后,怒发冲冠,狠狠地给了她一巴掌,怒吼道:“你这 8 怎么一半是绿的一半是红的,你以为我是傻子吗?”女孩被打后,委屈地哭了起来,什么也没说。过了一会儿,父亲突然崩溃了。请问这位父亲为什么过一会崩溃了?"
+}],
+"stream": true,
+"extra_body": {
+"anthropic_beta": ["interleaved-thinking-2025-05-14"],
+"thinking": {"type": "enabled", "budget_tokens": 4096}
+}
+}'
+```
diff --git a/scripts/push-to-ecr.sh b/scripts/push-to-ecr.sh
index ae51faef..7139750e 100755
--- a/scripts/push-to-ecr.sh
+++ b/scripts/push-to-ecr.sh
@@ -7,71 +7,118 @@ set -o errexit # exit on first error
set -o nounset # exit on using unset variables
set -o pipefail # exit on any error in a pipeline
+# Prompt user for inputs
+echo "================================================"
+echo "Bedrock Access Gateway - Build and Push to ECR"
+echo "================================================"
+echo ""
+
+# Get repository name for Lambda version
+read -p "Enter ECR repository name for Lambda (default: bedrock-proxy-api): " LAMBDA_REPO
+LAMBDA_REPO=${LAMBDA_REPO:-bedrock-proxy-api}
+
+# Get repository name for ECS/Fargate version
+read -p "Enter ECR repository name for ECS/Fargate (default: bedrock-proxy-api-ecs): " ECS_REPO
+ECS_REPO=${ECS_REPO:-bedrock-proxy-api-ecs}
+
+# Get image tag
+read -p "Enter image tag (default: latest): " TAG
+TAG=${TAG:-latest}
+
+# Get AWS region
+read -p "Enter AWS region (default: us-east-1): " AWS_REGION
+AWS_REGION=${AWS_REGION:-us-east-1}
+
+echo ""
+echo "Configuration:"
+echo " Lambda Repository: $LAMBDA_REPO"
+echo " ECS/Fargate Repository: $ECS_REPO"
+echo " Image Tag: $TAG"
+echo " AWS Region: $AWS_REGION"
+echo ""
+read -p "Continue with these settings? (y/n): " CONFIRM
+if [[ ! "$CONFIRM" =~ ^[Yy]$ ]]; then
+ echo "Aborted."
+ exit 1
+fi
+echo ""
+
+# Acknowledgment about ECR repository creation
+echo "ℹ️ NOTICE: This script will automatically create ECR repositories if they don't exist."
+echo " The repositories will be created with the following default settings:"
+echo " - Image tag mutability: MUTABLE (allows overwriting tags)"
+echo " - Image scanning: Disabled"
+echo " - Encryption: AES256 (AWS managed encryption)"
+echo ""
+echo " You can modify these settings later in the AWS ECR Console if needed."
+echo " Required IAM permissions: ecr:CreateRepository, ecr:GetAuthorizationToken,"
+echo " ecr:BatchCheckLayerAvailability, ecr:InitiateLayerUpload, ecr:UploadLayerPart,"
+echo " ecr:CompleteLayerUpload, ecr:PutImage"
+echo ""
+read -p "Do you acknowledge and want to proceed? (y/n): " ACK_CONFIRM
+if [[ ! "$ACK_CONFIRM" =~ ^[Yy]$ ]]; then
+ echo "Aborted."
+ exit 1
+fi
+echo ""
+
# Define variables
-TAG="latest"
-ARCHS=("arm64" "amd64")
-AWS_REGIONS=("us-east-1") # List of AWS region, use below liest if you don't enable ECR repository replication
-# AWS_REGIONS=("us-east-1" "us-west-2" "eu-central-1" "ap-southeast-1" "ap-southeast-2" "ap-northeast-1" "eu-central-1" "eu-west-3") # List of supported AWS regions
+ARCHS=("arm64") # Single architecture for simplicity
-build_and_push_images() {
+build_and_push_image() {
local IMAGE_NAME=$1
local TAG=$2
- local ENABLE_MULTI_ARCH=${3:-true} # Parameter for enabling multi-arch build, default is true
- local DOCKERFILE_PATH=${4:-"../src/Dockerfile_ecs"} # Parameter for Dockerfile path, default is "../src/Dockerfile_ecs"
-
- # Build Docker image for each architecture
- if [ "$ENABLE_MULTI_ARCH" == "true" ]; then
- for ARCH in "${ARCHS[@]}"
- do
- # Build multi-architecture Docker image
- docker buildx build --platform linux/$ARCH -t $IMAGE_NAME:$TAG-$ARCH -f $DOCKERFILE_PATH --load ../src/
- done
- else
- # Build single architecture Docker image
- docker buildx build --platform linux/${ARCHS[0]} -t $IMAGE_NAME:$TAG -f $DOCKERFILE_PATH --load ../src/
- fi
-
- # Push Docker image to ECR for each architecture in each AWS region
- for REGION in "${AWS_REGIONS[@]}"
- do
- # Get the account ID for the current region
- ACCOUNT_ID=$(aws sts get-caller-identity --region $REGION --query Account --output text)
-
- # Create repository URI
- REPOSITORY_URI="${ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com/${IMAGE_NAME}"
-
- # Create ECR repository if it doesn't exist
- aws ecr create-repository --repository-name "${IMAGE_NAME}" --region $REGION || true
-
- # Log in to ECR
- aws ecr get-login-password --region $REGION | docker login --username AWS --password-stdin $REPOSITORY_URI
-
- # Push the image to ECR for each architecture
- if [ "$ENABLE_MULTI_ARCH" == "true" ]; then
- for ARCH in "${ARCHS[@]}"
- do
- # Tag the image for the current region
- docker tag $IMAGE_NAME:$TAG-$ARCH $REPOSITORY_URI:$TAG-$ARCH
- # Push the image to ECR
- docker push $REPOSITORY_URI:$TAG-$ARCH
- # Create a manifest for the image
- docker manifest create $REPOSITORY_URI:$TAG $REPOSITORY_URI:$TAG-$ARCH --amend
- # Annotate the manifest with architecture information
- docker manifest annotate $REPOSITORY_URI:$TAG "$REPOSITORY_URI:$TAG-$ARCH" --os linux --arch $ARCH
- done
-
- # Push the manifest to ECR
- docker manifest push $REPOSITORY_URI:$TAG
- else
- # Tag the image for the current region
- docker tag $IMAGE_NAME:$TAG $REPOSITORY_URI:$TAG
- # Push the image to ECR
- docker push $REPOSITORY_URI:$TAG
- fi
-
- echo "Pushed $IMAGE_NAME:$TAG to $REPOSITORY_URI"
- done
+ local DOCKERFILE_PATH=$3
+ local REGION=$AWS_REGION
+ local ARCH=${ARCHS[0]}
+
+ echo "Building $IMAGE_NAME:$TAG..."
+
+ # Build Docker image
+ docker buildx build --platform linux/$ARCH -t $IMAGE_NAME:$TAG -f $DOCKERFILE_PATH --load ../src/
+
+ # Get the account ID
+ ACCOUNT_ID=$(aws sts get-caller-identity --region $REGION --query Account --output text)
+
+ # Create repository URI
+ REPOSITORY_URI="${ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com/${IMAGE_NAME}"
+
+ echo "Creating ECR repository if it doesn't exist..."
+ # Create ECR repository if it doesn't exist
+ aws ecr create-repository --repository-name "${IMAGE_NAME}" --region $REGION || true
+
+ echo "Logging in to ECR..."
+ # Log in to ECR
+ aws ecr get-login-password --region $REGION | docker login --username AWS --password-stdin $REPOSITORY_URI
+
+ echo "Pushing image to ECR..."
+ # Tag the image for ECR
+ docker tag $IMAGE_NAME:$TAG $REPOSITORY_URI:$TAG
+
+ # Push the image to ECR
+ docker push $REPOSITORY_URI:$TAG
+
+ echo "✅ Successfully pushed $IMAGE_NAME:$TAG to $REPOSITORY_URI"
+ echo ""
}
-build_and_push_images "bedrock-proxy-api" "$TAG" "false" "../src/Dockerfile"
-build_and_push_images "bedrock-proxy-api-ecs" "$TAG"
+echo "Building and pushing Lambda image..."
+build_and_push_image "$LAMBDA_REPO" "$TAG" "../src/Dockerfile"
+
+echo "Building and pushing ECS/Fargate image..."
+build_and_push_image "$ECS_REPO" "$TAG" "../src/Dockerfile_ecs"
+
+echo "================================================"
+echo "✅ All images successfully pushed!"
+echo "================================================"
+echo ""
+echo "Your container image URIs:"
+ACCOUNT_ID=$(aws sts get-caller-identity --region $AWS_REGION --query Account --output text)
+echo " Lambda: ${ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${LAMBDA_REPO}:${TAG}"
+echo " ECS/Fargate: ${ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ECS_REPO}:${TAG}"
+echo ""
+echo "Next steps:"
+echo " 1. Download the CloudFormation templates from deployment/ folder"
+echo " 2. Update the ContainerImageUri parameter with your image URI above"
+echo " 3. Deploy the stack via AWS CloudFormation Console"
+echo ""
diff --git a/src/Dockerfile_ecs b/src/Dockerfile_ecs
index c1240104..1fb8fa75 100644
--- a/src/Dockerfile_ecs
+++ b/src/Dockerfile_ecs
@@ -1,4 +1,4 @@
-FROM public.ecr.aws/docker/library/python:3.12-slim
+FROM public.ecr.aws/docker/library/python:3.13-slim
WORKDIR /app
@@ -8,6 +8,19 @@ RUN pip install --no-cache-dir --upgrade -r /app/requirements.txt
COPY ./api /app/api
-ENV PORT=80
+# Create non-root user
+RUN groupadd -r appuser && useradd -r -g appuser appuser && \
+ chown -R appuser:appuser /app
+
+USER appuser
+
+# Preload tiktoken encoding: https://github.com/aws-samples/bedrock-access-gateway/issues/118
+ENV TIKTOKEN_CACHE_DIR=/app/.cache/tiktoken
+RUN python3 -c 'import tiktoken_ext.openai_public as tke; tke.cl100k_base()'
+
+ENV PORT=8080
+
+HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
+ CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8080/health').read()"
CMD ["sh", "-c", "uvicorn api.app:app --host 0.0.0.0 --port ${PORT}"]
diff --git a/src/api/app.py b/src/api/app.py
index 49a05190..ef63c550 100644
--- a/src/api/app.py
+++ b/src/api/app.py
@@ -1,4 +1,5 @@
import logging
+import os
import uvicorn
from fastapi import FastAPI
@@ -23,9 +24,16 @@
)
app = FastAPI(**config)
+allowed_origins = os.environ.get("ALLOWED_ORIGINS", "*")
+origins_list = [origin.strip() for origin in allowed_origins.split(",")] if allowed_origins != "*" else ["*"]
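+# e.g. ALLOWED_ORIGINS="https://app.example.com,https://admin.example.com" (example values) restricts CORS to those two origins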
+
+# Warn if CORS allows all origins
+if origins_list == ["*"]:
+ logging.warning("CORS is configured to allow all origins (*). Set ALLOWED_ORIGINS environment variable to restrict access.")
+
app.add_middleware(
CORSMiddleware,
- allow_origins=["*"],
+ allow_origins=origins_list, # nosec - configurable via ALLOWED_ORIGINS env var
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
@@ -45,10 +53,21 @@ async def health():
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request, exc):
+ logger = logging.getLogger(__name__)
+
+ # Log essential info only - avoid sensitive data and performance overhead
+ logger.warning(
+ "Request validation failed: %s %s - %s",
+ request.method,
+ request.url.path,
+ str(exc).split('\n')[0] # First line only
+ )
+
return PlainTextResponse(str(exc), status_code=400)
handler = Mangum(app)
if __name__ == "__main__":
- uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=True)
+    # Bind to 0.0.0.0 for container environments; network access is controlled by network policies and load balancers
+ uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=False) # nosec B104
diff --git a/src/api/auth.py b/src/api/auth.py
index 1a64653d..52866512 100644
--- a/src/api/auth.py
+++ b/src/api/auth.py
@@ -7,8 +7,6 @@
from fastapi import Depends, HTTPException, status
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
-from api.setting import DEFAULT_API_KEYS
-
api_key_param = os.environ.get("API_KEY_PARAM_NAME")
api_key_secret_arn = os.environ.get("API_KEY_SECRET_ARN")
api_key_env = os.environ.get("API_KEY")
@@ -31,8 +29,9 @@
elif api_key_env:
api_key = api_key_env
else:
- # For local use only.
- api_key = DEFAULT_API_KEYS
+ raise RuntimeError(
+ "API Key is not configured. Please set up your API Key."
+ )
security = HTTPBearer()
diff --git a/src/api/models/base.py b/src/api/models/base.py
index 6d45340e..5e7a9cb3 100644
--- a/src/api/models/base.py
+++ b/src/api/models/base.py
@@ -1,3 +1,4 @@
+import logging
import time
import uuid
from abc import ABC, abstractmethod
@@ -14,6 +15,8 @@
Error,
)
+logger = logging.getLogger(__name__)
+
class BaseChatModel(ABC):
"""Represent a basic chat model
@@ -46,6 +49,7 @@ def generate_message_id() -> str:
@staticmethod
def stream_response_to_bytes(response: ChatStreamResponse | Error | None = None) -> bytes:
if isinstance(response, Error):
+ logger.error("Stream error: %s", response.error.message if response.error else "Unknown error")
data = response.model_dump_json()
elif isinstance(response, ChatStreamResponse):
# to populate other fields when using exclude_unset=True
diff --git a/src/api/models/bedrock.py b/src/api/models/bedrock.py
index 293f653a..66b74245 100644
--- a/src/api/models/bedrock.py
+++ b/src/api/models/bedrock.py
@@ -23,6 +23,7 @@
ChatStreamResponse,
Choice,
ChoiceDelta,
+ CompletionTokensDetails,
Embedding,
EmbeddingsRequest,
EmbeddingsResponse,
@@ -31,18 +32,35 @@
ErrorMessage,
Function,
ImageContent,
+ PromptTokensDetails,
ResponseFunction,
TextContent,
ToolCall,
+ ToolContent,
ToolMessage,
Usage,
UserMessage,
)
-from api.setting import AWS_REGION, DEBUG, DEFAULT_MODEL, ENABLE_CROSS_REGION_INFERENCE
+from api.setting import (
+ AWS_REGION,
+ DEBUG,
+ DEFAULT_MODEL,
+ ENABLE_CROSS_REGION_INFERENCE,
+ ENABLE_APPLICATION_INFERENCE_PROFILES,
+ ENABLE_PROMPT_CACHING,
+)
logger = logging.getLogger(__name__)
-config = Config(connect_timeout=60, read_timeout=120, retries={"max_attempts": 1})
+config = Config(
+ connect_timeout=60, # Connection timeout: 60 seconds
+ read_timeout=900, # Read timeout: 15 minutes (suitable for long streaming responses)
+ retries={
+ 'max_attempts': 8, # Maximum retry attempts
+ 'mode': 'adaptive' # Adaptive retry mode
+ },
+ max_pool_connections=50 # Maximum connection pool size
+ )
bedrock_runtime = boto3.client(
service_name="bedrock-runtime",
@@ -55,26 +73,29 @@
config=config,
)
-
-def get_inference_region_prefix():
- if AWS_REGION.startswith("ap-"):
- return "apac"
- return AWS_REGION[:2]
-
-
-# https://docs.aws.amazon.com/bedrock/latest/userguide/inference-profiles-support.html
-cr_inference_prefix = get_inference_region_prefix()
-
SUPPORTED_BEDROCK_EMBEDDING_MODELS = {
"cohere.embed-multilingual-v3": "Cohere Embed Multilingual",
"cohere.embed-english-v3": "Cohere Embed English",
+ "amazon.titan-embed-text-v1": "Titan Embeddings G1 - Text",
+ "amazon.titan-embed-text-v2:0": "Titan Embeddings G2 - Text",
# Disable Titan embedding.
- # "amazon.titan-embed-text-v1": "Titan Embeddings G1 - Text",
# "amazon.titan-embed-image-v1": "Titan Multimodal Embeddings G1"
}
ENCODER = tiktoken.get_encoding("cl100k_base")
+# Global mapping: Profile ID/ARN → Foundation Model ID
+# Handles both SYSTEM_DEFINED (cross-region) and APPLICATION profiles
+# This enables feature detection for all profile types without pattern matching
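+# e.g. profile_metadata["us.anthropic.claude-3-7-sonnet-20250219-v1:0"] = {
+#     "underlying_model_id": "anthropic.claude-3-7-sonnet-20250219-v1:0",
+#     "profile_type": "SYSTEM_DEFINED",
+# }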
+profile_metadata = {}
+
+# Models that don't support both temperature and topP simultaneously
+# When both are provided, temperature takes precedence and topP is removed
+TEMPERATURE_TOPP_CONFLICT_MODELS = {
+ "claude-sonnet-4-5",
+ "claude-haiku-4-5",
+}
+
def list_bedrock_models() -> dict:
"""Automatically getting a list of supported models.
@@ -82,15 +103,66 @@ def list_bedrock_models() -> dict:
Returns a model list combines:
- ON_DEMAND models.
- Cross-Region Inference Profiles (if enabled via Env)
+ - Application Inference Profiles (if enabled via Env)
"""
model_list = {}
try:
- profile_list = []
if ENABLE_CROSS_REGION_INFERENCE:
- # List system defined inference profile IDs
- response = bedrock_client.list_inference_profiles(maxResults=1000, typeEquals="SYSTEM_DEFINED")
- profile_list = [p["inferenceProfileId"] for p in response["inferenceProfileSummaries"]]
+ # List system defined inference profile IDs and store underlying model mapping
+ paginator = bedrock_client.get_paginator('list_inference_profiles')
+ for page in paginator.paginate(maxResults=1000, typeEquals="SYSTEM_DEFINED"):
+ for profile in page["inferenceProfileSummaries"]:
+ profile_id = profile.get("inferenceProfileId")
+ if not profile_id:
+ continue
+
+ # Extract underlying model from first model in the profile
+ models = profile.get("models", [])
+ if models:
+ model_arn = models[0].get("modelArn", "")
+ if model_arn:
+ # Extract foundation model ID from ARN
+ model_id = model_arn.split('/')[-1]
+ profile_metadata[profile_id] = {
+ "underlying_model_id": model_id,
+ "profile_type": "SYSTEM_DEFINED",
+ }
+ if ENABLE_APPLICATION_INFERENCE_PROFILES:
+ # List application defined inference profile IDs and create mapping
+ paginator = bedrock_client.get_paginator('list_inference_profiles')
+ for page in paginator.paginate(maxResults=1000, typeEquals="APPLICATION"):
+ for profile in page["inferenceProfileSummaries"]:
+ try:
+ profile_arn = profile.get("inferenceProfileArn")
+ if not profile_arn:
+ continue
+
+ # Process all models in the profile
+ models = profile.get("models", [])
+ if not models:
+ logger.warning(f"Application profile {profile_arn} has no models")
+ continue
+
+ # Take first model - all models in array are same type (regional instances)
+ first_model = models[0]
+ model_arn = first_model.get("modelArn", "")
+ if not model_arn:
+ continue
+
+ # Extract model ID from ARN (works for both foundation models and cross-region profiles)
+ model_id = model_arn.split('/')[-1] if '/' in model_arn else model_arn
+
+ # Store in unified profile metadata for feature detection
+ profile_metadata[profile_arn] = {
+ "underlying_model_id": model_id,
+ "profile_type": "APPLICATION",
+ "profile_name": profile.get("inferenceProfileName", ""),
+ }
+ except Exception as e:
+ logger.warning(f"Error processing application profile: {e}")
+ continue
+
# List foundation models, only cares about text outputs here.
response = bedrock_client.list_foundation_models(byOutputModality="TEXT")
@@ -109,10 +181,10 @@ def list_bedrock_models() -> dict:
if "ON_DEMAND" in inference_types:
model_list[model_id] = {"modalities": input_modalities}
- # Add cross-region inference model list.
- profile_id = cr_inference_prefix + "." + model_id
- if profile_id in profile_list:
- model_list[profile_id] = {"modalities": input_modalities}
+ # Add all inference profiles (cross-region and application) for this model
+ for profile_id, metadata in profile_metadata.items():
+ if metadata.get("underlying_model_id") == model_id:
+ model_list[profile_id] = {"modalities": input_modalities}
except Exception as e:
logger.error(f"Unable to list models: {str(e)}")
@@ -140,7 +212,26 @@ def validate(self, chat_request: ChatRequest):
error = ""
# check if model is supported
if chat_request.model not in bedrock_model_list.keys():
- error = f"Unsupported model {chat_request.model}, please use models API to get a list of supported models"
+ # Provide helpful error for application profiles
+ if "application-inference-profile" in chat_request.model:
+ error = (
+ f"Application profile {chat_request.model} not found. "
+ f"Available profiles can be listed via GET /models API. "
+ f"Ensure ENABLE_APPLICATION_INFERENCE_PROFILES=true and "
+ f"the profile exists in your AWS account."
+ )
+ else:
+ error = f"Unsupported model {chat_request.model}, please use models API to get a list of supported models"
+ logger.error("Unsupported model: %s", chat_request.model)
+
+ # Validate profile has resolvable underlying model
+ if not error and chat_request.model in profile_metadata:
+ resolved = self._resolve_to_foundation_model(chat_request.model)
+ if resolved == chat_request.model:
+ logger.warning(
+ f"Could not resolve profile {chat_request.model} "
+ f"to underlying model. Some features may not work correctly."
+ )
if error:
raise HTTPException(
@@ -148,11 +239,101 @@ def validate(self, chat_request: ChatRequest):
detail=error,
)
+ def _resolve_to_foundation_model(self, model_id: str) -> str:
+ """
+ Resolve any model identifier to foundation model ID for feature detection.
+
+ Handles:
+ - Cross-region profiles (us.*, eu.*, apac.*, global.*)
+ - Application profiles (arn:aws:bedrock:...:application-inference-profile/...)
+ - Foundation models (pass through unchanged)
+
+ No pattern matching needed - just dictionary lookup.
+ Unknown identifiers pass through unchanged (graceful fallback).
+
+ Args:
+ model_id: Can be foundation model ID, cross-region profile, or app profile ARN
+
+ Returns:
+ Foundation model ID if mapping exists, otherwise original model_id
+ """
+ if model_id in profile_metadata:
+ return profile_metadata[model_id]["underlying_model_id"]
+ return model_id
+
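A usage sketch of the lookup, assuming `handler` is an instance of the enclosing chat model class and `profile_metadata` is populated as in the example above (identifiers are illustrative):

```python
# Known profile: resolves to the underlying foundation model.
handler._resolve_to_foundation_model(
    "arn:aws:bedrock:us-east-1:111122223333:application-inference-profile/abc123"
)  # -> "anthropic.claude-3-5-sonnet-20240620-v1:0"

# Unknown identifier: passes through unchanged (graceful fallback).
handler._resolve_to_foundation_model("anthropic.claude-3-haiku-20240307-v1:0")
# -> "anthropic.claude-3-haiku-20240307-v1:0"
```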
+ def _supports_prompt_caching(self, model_id: str) -> bool:
+ """
+ Check if model supports prompt caching based on model ID pattern.
+
+ Uses pattern matching instead of a hardcoded whitelist for better maintainability.
+ Automatically supports new models following the naming convention.
+
+ Supported models:
+ - Claude: anthropic.claude-* (excluding very old versions)
+ - Nova: amazon.nova-*
+
+ Returns:
+ bool: True if model supports prompt caching
+ """
+ # Resolve profile to underlying model for feature detection
+ resolved_model = self._resolve_to_foundation_model(model_id)
+ model_lower = resolved_model.lower()
+
+ # Claude models pattern matching
+ if "anthropic.claude" in model_lower:
+ # Exclude very old models that don't support caching
+ excluded_patterns = ["claude-instant", "claude-v1", "claude-v2"]
+ if any(pattern in model_lower for pattern in excluded_patterns):
+ return False
+ return True
+
+ # Nova models pattern matching
+ if "amazon.nova" in model_lower:
+ return True
+
+ # Future providers can be added here
+ # Example: if "provider.model-name" in model_lower: return True
+
+ return False
+
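Hedged examples of the pattern check, again assuming `handler` is an instance of the enclosing class; the model IDs are illustrative:

```python
handler._supports_prompt_caching("amazon.nova-pro-v1:0")               # True
handler._supports_prompt_caching("anthropic.claude-v2")                # False (excluded pattern)
handler._supports_prompt_caching("mistral.mistral-large-2402-v1:0")    # False (no matching pattern)
```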
+ def _get_max_cache_tokens(self, model_id: str) -> int | None:
+ """
+ Get maximum cacheable tokens limit for the model.
+
+ Different models have different caching limits:
+ - Claude: No explicit limit mentioned in docs
+ - Nova: 20,000 tokens max
+
+ Returns:
+ int | None: Max tokens, or None if unlimited
+ """
+ # Resolve profile to underlying model for feature detection
+ resolved_model = self._resolve_to_foundation_model(model_id)
+ model_lower = resolved_model.lower()
+
+ # Nova models have 20K limit
+ if "amazon.nova" in model_lower:
+ return 20_000
+
+ # Claude: No explicit limit
+ if "anthropic.claude" in model_lower:
+ return None
+
+ return None
+
async def _invoke_bedrock(self, chat_request: ChatRequest, stream=False):
"""Common logic for invoke bedrock models"""
if DEBUG:
logger.info("Raw request: " + chat_request.model_dump_json())
+ # Log profile resolution for debugging
+ if chat_request.model in profile_metadata:
+ resolved = self._resolve_to_foundation_model(chat_request.model)
+ profile_type = profile_metadata[chat_request.model].get("profile_type", "UNKNOWN")
+ logger.info(
+ f"Profile resolution: {chat_request.model} ({profile_type}) → {resolved}"
+ )
+
# convert OpenAI chat request to Bedrock SDK request
args = self._parse_request(chat_request)
if DEBUG:
@@ -161,18 +342,20 @@ async def _invoke_bedrock(self, chat_request: ChatRequest, stream=False):
try:
if stream:
# Run the blocking boto3 call in a thread pool
- response = await run_in_threadpool(bedrock_runtime.converse_stream, **args)
+ response = await run_in_threadpool(
+ bedrock_runtime.converse_stream, **args
+ )
else:
# Run the blocking boto3 call in a thread pool
response = await run_in_threadpool(bedrock_runtime.converse, **args)
except bedrock_runtime.exceptions.ValidationException as e:
- logger.error("Validation Error: " + str(e))
+ logger.error("Bedrock validation error for model %s: %s", chat_request.model, str(e))
raise HTTPException(status_code=400, detail=str(e))
except bedrock_runtime.exceptions.ThrottlingException as e:
- logger.error("Throttling Error: " + str(e))
+ logger.warning("Bedrock throttling for model %s: %s", chat_request.model, str(e))
raise HTTPException(status_code=429, detail=str(e))
except Exception as e:
- logger.error(e)
+ logger.error("Bedrock invocation failed for model %s: %s", chat_request.model, str(e))
raise HTTPException(status_code=500, detail=str(e))
return response
@@ -183,17 +366,32 @@ async def chat(self, chat_request: ChatRequest) -> ChatResponse:
response = await self._invoke_bedrock(chat_request)
output_message = response["output"]["message"]
- input_tokens = response["usage"]["inputTokens"]
- output_tokens = response["usage"]["outputTokens"]
+ usage = response["usage"]
+
+ # Extract all token counts
+ output_tokens = usage["outputTokens"]
+ total_tokens = usage["totalTokens"]
finish_reason = response["stopReason"]
+ # Extract prompt caching metrics if available
+ cache_read_tokens = usage.get("cacheReadInputTokens", 0)
+ cache_creation_tokens = usage.get("cacheWriteInputTokens", 0)  # Converse reports cache writes as cacheWriteInputTokens
+
+ # Calculate actual prompt tokens
+ # Bedrock's totalTokens includes all: inputTokens + cacheRead + cacheWrite + outputTokens
+ # So: prompt_tokens = totalTokens - outputTokens
+ actual_prompt_tokens = total_tokens - output_tokens
+
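A worked example of the arithmetic above with hypothetical numbers, using the Converse usage field names:

```python
usage = {
    "inputTokens": 256,             # non-cached prompt tokens
    "outputTokens": 180,
    "cacheReadInputTokens": 1024,   # prompt tokens served from cache
    "cacheWriteInputTokens": 0,
    "totalTokens": 1460,            # 256 + 1024 + 0 + 180
}
actual_prompt_tokens = usage["totalTokens"] - usage["outputTokens"]  # 1280
```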
chat_response = self._create_response(
model=chat_request.model,
message_id=message_id,
content=output_message["content"],
finish_reason=finish_reason,
- input_tokens=input_tokens,
+ input_tokens=actual_prompt_tokens,
output_tokens=output_tokens,
+ total_tokens=total_tokens,
+ cache_read_tokens=cache_read_tokens,
+ cache_creation_tokens=cache_creation_tokens,
)
if DEBUG:
logger.info("Proxy response :" + chat_response.model_dump_json())
@@ -211,6 +409,7 @@ async def chat_stream(self, chat_request: ChatRequest) -> AsyncIterable[bytes]:
response = await self._invoke_bedrock(chat_request, stream=True)
message_id = self.generate_message_id()
stream = response.get("stream")
+ self.think_emitted = False
async for chunk in self._async_iterate(stream):
args = {"model_id": chat_request.model, "message_id": message_id, "chunk": chunk}
stream_response = self._create_response_stream(**args)
@@ -231,28 +430,75 @@ async def chat_stream(self, chat_request: ChatRequest) -> AsyncIterable[bytes]:
# return an [DONE] message at the end.
yield self.stream_response_to_bytes()
+ self.think_emitted = False # Cleanup
except Exception as e:
+ logger.error("Stream error for model %s: %s", chat_request.model, str(e))
error_event = Error(error=ErrorMessage(message=str(e)))
yield self.stream_response_to_bytes(error_event)
def _parse_system_prompts(self, chat_request: ChatRequest) -> list[dict[str, str]]:
- """Create system prompts.
- Note that not all models support system prompts.
+ """Create system prompts with optional prompt caching support.
- example output: [{"text" : system_prompt}]
+ Prompt caching can be enabled via:
+ 1. ENABLE_PROMPT_CACHING environment variable (global default)
+ 2. extra_body.prompt_caching.system = True/False (per-request override)
- See example:
- https://docs.aws.amazon.com/bedrock/latest/userguide/conversation-inference.html#message-inference-examples
- """
+ Only adds cachePoint if:
+ - Model supports caching (Claude, Nova)
+ - Caching is enabled (ENV or extra_body)
+ - System prompts exist and meet minimum token requirements
+
+ Example output: [{"text" : system_prompt}, {"cachePoint": {"type": "default"}}]
+ See: https://docs.aws.amazon.com/bedrock/latest/userguide/prompt-caching.html
+ """
system_prompts = []
for message in chat_request.messages:
if message.role != "system":
- # ignore system messages here
continue
- assert isinstance(message.content, str)
+ if not isinstance(message.content, str):
+ raise TypeError(f"System message content must be a string, got {type(message.content).__name__}")
system_prompts.append({"text": message.content})
+ if not system_prompts:
+ return system_prompts
+
+ # Check if model supports prompt caching
+ if not self._supports_prompt_caching(chat_request.model):
+ return system_prompts
+
+ # Determine if caching should be enabled
+ cache_enabled = ENABLE_PROMPT_CACHING # Default from ENV
+
+ # Check for extra_body override
+ if chat_request.extra_body and isinstance(chat_request.extra_body, dict):
+ prompt_caching = chat_request.extra_body.get("prompt_caching", {})
+ if "system" in prompt_caching:
+ # extra_body explicitly controls caching
+ cache_enabled = prompt_caching.get("system") is True
+
+ if not cache_enabled:
+ return system_prompts
+
+ # Estimate total tokens for limit check
+ total_text = " ".join(p.get("text", "") for p in system_prompts)
+ estimated_tokens = len(total_text.split()) * 1.3 # Rough estimate
+
+ # Check token limits (Nova has 20K limit)
+ max_tokens = self._get_max_cache_tokens(chat_request.model)
+ if max_tokens and estimated_tokens > max_tokens:
+ logger.warning(
+ f"System prompts (~{estimated_tokens:.0f} tokens) exceed model cache limit ({max_tokens} tokens). "
+ f"Caching will still be attempted but may not work optimally."
+ )
+ # Still add cachePoint - let Bedrock handle the limit
+
+ # Add cache checkpoint after system prompts
+ system_prompts.append({"cachePoint": {"type": "default"}})
+
+ if DEBUG:
+ logger.info(f"Added cachePoint to system prompts for model {chat_request.model}")
+
return system_prompts
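For context, a hypothetical client call that opts into system-prompt caching per request; the base URL and API key are placeholders for however the gateway is deployed:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/api/v1", api_key="bedrock")
client.chat.completions.create(
    model="anthropic.claude-3-5-sonnet-20240620-v1:0",  # illustrative model ID
    messages=[
        {"role": "system", "content": "You are a meticulous code reviewer."},
        {"role": "user", "content": "Review the attached diff."},
    ],
    extra_body={"prompt_caching": {"system": True}},  # per-request override
)
```

With caching enabled and a supported model, the resulting Bedrock `system` list ends with `{"cachePoint": {"type": "default"}}`.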
def _parse_messages(self, chat_request: ChatRequest) -> list[dict]:
@@ -273,16 +519,29 @@ def _parse_messages(self, chat_request: ChatRequest) -> list[dict]:
messages.append(
{
"role": message.role,
- "content": self._parse_content_parts(message, chat_request.model),
+ "content": self._parse_content_parts(
+ message, chat_request.model
+ ),
}
)
elif isinstance(message, AssistantMessage):
- if message.content.strip():
+ # Check if message has content that's not empty
+ has_content = False
+ if isinstance(message.content, str):
+ has_content = message.content.strip() != ""
+ elif isinstance(message.content, list):
+ has_content = len(message.content) > 0
+ elif message.content is not None:
+ has_content = True
+
+ if has_content:
# Text message
messages.append(
{
"role": message.role,
- "content": self._parse_content_parts(message, chat_request.model),
+ "content": self._parse_content_parts(
+ message, chat_request.model
+ ),
}
)
if message.tool_calls:
@@ -307,6 +566,10 @@ def _parse_messages(self, chat_request: ChatRequest) -> list[dict]:
# Bedrock does not support tool role,
# Add toolResult to content
# https://docs.aws.amazon.com/bedrock/latest/APIReference/API_runtime_ToolResultBlock.html
+
+ # Handle different content formats from OpenAI SDK
+ tool_content = self._extract_tool_content(message.content)
+
messages.append(
{
"role": "user",
@@ -314,7 +577,7 @@ def _parse_messages(self, chat_request: ChatRequest) -> list[dict]:
{
"toolResult": {
"toolUseId": message.tool_call_id,
- "content": [{"text": message.content}],
+ "content": [{"text": tool_content}],
}
}
],
@@ -324,9 +587,60 @@ def _parse_messages(self, chat_request: ChatRequest) -> list[dict]:
else:
# ignore others, such as system messages
continue
- return self._reframe_multi_payloard(messages)
+ return self._reframe_multi_payloard(messages, chat_request)
+
+ def _extract_tool_content(self, content) -> str:
+ """Extract text content from various OpenAI SDK tool message formats.
+
+ Handles:
+ - String content (legacy format)
+ - List of content objects (OpenAI SDK 1.91.0+)
+ - Nested JSON structures within text content
+ """
+ try:
+ if isinstance(content, str):
+ return content
+
+ if isinstance(content, list):
+ text_parts = []
+ for i, item in enumerate(content):
+ if isinstance(item, dict):
+ # Handle dict with 'text' field
+ if "text" in item:
+ item_text = item["text"]
+ if isinstance(item_text, str):
+ # Try to parse as JSON if it looks like JSON
+ if item_text.strip().startswith('{') and item_text.strip().endswith('}'):
+ try:
+ parsed_json = json.loads(item_text)
+ # Convert JSON object to readable text
+ text_parts.append(json.dumps(parsed_json, indent=2))
+ except json.JSONDecodeError:
+ # Silently fallback to original text
+ text_parts.append(item_text)
+ else:
+ text_parts.append(item_text)
+ else:
+ text_parts.append(str(item_text))
+ else:
+ # Handle other dict formats - convert to JSON string
+ text_parts.append(json.dumps(item, indent=2))
+ elif hasattr(item, 'text'):
+ # Handle ToolContent objects
+ text_parts.append(item.text)
+ else:
+ # Convert any other type to string
+ text_parts.append(str(item))
+ return "\n".join(text_parts)
+
+ # Fallback for any other type
+ return str(content)
+ except Exception as e:
+ logger.warning("Tool content extraction failed: %s", str(e))
+ # Return a safe fallback
+ return str(content) if content is not None else ""
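Illustrative inputs and outputs for the helper above, assuming `handler` is an instance of the enclosing class:

```python
handler._extract_tool_content("plain text result")
# -> "plain text result"

handler._extract_tool_content([{"type": "text", "text": '{"status": "ok"}'}])
# -> pretty-printed JSON string:
# {
#   "status": "ok"
# }
```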
- def _reframe_multi_payloard(self, messages: list) -> list:
+ def _reframe_multi_payloard(self, messages: list, chat_request: ChatRequest = None) -> list:
"""Receive messages and reformat them to comply with the Claude format
With OpenAI format requests, it's not a problem to repeatedly receive messages from the same role, but
@@ -362,7 +676,9 @@ def _reframe_multi_payloard(self, messages: list) -> list:
# If the next role is different from the previous message, add the previous role's messages to the list
if next_role != current_role:
if current_content:
- reformatted_messages.append({"role": current_role, "content": current_content})
+ reformatted_messages.append(
+ {"role": current_role, "content": current_content}
+ )
# Switch to the new role
current_role = next_role
current_content = []
@@ -375,7 +691,32 @@ def _reframe_multi_payloard(self, messages: list) -> list:
# Add the last role's messages to the list
if current_content:
- reformatted_messages.append({"role": current_role, "content": current_content})
+ reformatted_messages.append(
+ {"role": current_role, "content": current_content}
+ )
+
+ # Add cachePoint to messages if enabled and supported
+ if chat_request and reformatted_messages:
+ if not self._supports_prompt_caching(chat_request.model):
+ return reformatted_messages
+
+ # Determine if messages caching should be enabled
+ cache_enabled = ENABLE_PROMPT_CACHING
+
+ if chat_request.extra_body and isinstance(chat_request.extra_body, dict):
+ prompt_caching = chat_request.extra_body.get("prompt_caching", {})
+ if "messages" in prompt_caching:
+ cache_enabled = prompt_caching.get("messages") is True
+
+ if cache_enabled:
+ # Add cachePoint to the last user message content
+ for msg in reversed(reformatted_messages):
+ if msg["role"] == "user" and msg.get("content"):
+ # Add cachePoint at the end of user message content
+ msg["content"].append({"cachePoint": {"type": "default"}})
+ if DEBUG:
+ logger.info(f"Added cachePoint to last user message for model {chat_request.model}")
+ break
return reformatted_messages
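When message caching is active, the last user message comes out shaped roughly like this (content values are hypothetical):

```python
last_user_message = {
    "role": "user",
    "content": [
        {"text": "Summarize the design document."},
        {"cachePoint": {"type": "default"}},
    ],
}
```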
@@ -391,11 +732,28 @@ def _parse_request(self, chat_request: ChatRequest) -> dict:
# Base inference parameters.
inference_config = {
- "temperature": chat_request.temperature,
"maxTokens": chat_request.max_tokens,
- "topP": chat_request.top_p,
}
+ # Only include optional parameters when specified
+ if chat_request.temperature is not None:
+ inference_config["temperature"] = chat_request.temperature
+ if chat_request.top_p is not None:
+ inference_config["topP"] = chat_request.top_p
+
+ # Some models (Claude Sonnet 4.5, Haiku 4.5) don't support both temperature and topP
+ # When both are provided, keep temperature and remove topP
+ # Resolve profile to underlying model for feature detection
+ resolved_model = self._resolve_to_foundation_model(chat_request.model)
+ model_lower = resolved_model.lower()
+
+ # Check if model is in the conflict list and both parameters are present
+ if "temperature" in inference_config and "topP" in inference_config:
+ if any(conflict_model in model_lower for conflict_model in TEMPERATURE_TOPP_CONFLICT_MODELS):
+ inference_config.pop("topP", None)
+ if DEBUG:
+ logger.info(f"Removed topP for {chat_request.model} (conflicts with temperature)")
+
if chat_request.stop is not None:
stop = chat_request.stop
if isinstance(stop, str):
@@ -409,25 +767,48 @@ def _parse_request(self, chat_request: ChatRequest) -> dict:
"inferenceConfig": inference_config,
}
if chat_request.reasoning_effort:
- # From OpenAI api, the max_token is not supported in reasoning mode
- # Use max_completion_tokens if provided.
-
- max_tokens = (
- chat_request.max_completion_tokens if chat_request.max_completion_tokens else chat_request.max_tokens
- )
- budget_tokens = self._calc_budget_tokens(max_tokens, chat_request.reasoning_effort)
- inference_config["maxTokens"] = max_tokens
- # unset topP - Not supported
- inference_config.pop("topP")
+ # reasoning_effort is supported by Claude and DeepSeek v3
+ # Different models use different formats
+ # Resolve profile to underlying model for feature detection
+ resolved_model = self._resolve_to_foundation_model(chat_request.model)
+ model_lower = resolved_model.lower()
+
+ if "anthropic.claude" in model_lower:
+ # Claude format: reasoning_config = object with budget_tokens
+ max_tokens = (
+ chat_request.max_completion_tokens
+ if chat_request.max_completion_tokens
+ else chat_request.max_tokens
+ )
+ budget_tokens = self._calc_budget_tokens(
+ max_tokens, chat_request.reasoning_effort
+ )
+ inference_config["maxTokens"] = max_tokens
+ # unset topP - Not supported
+ inference_config.pop("topP", None)
- args["additionalModelRequestFields"] = {
- "reasoning_config": {"type": "enabled", "budget_tokens": budget_tokens}
- }
+ args["additionalModelRequestFields"] = {
+ "reasoning_config": {"type": "enabled", "budget_tokens": budget_tokens}
+ }
+ elif "deepseek.v3" in model_lower or "deepseek.deepseek-v3" in model_lower:
+ # DeepSeek v3 format: reasoning_config = string ('low', 'medium', 'high')
+ # From Bedrock Playground: {"reasoning_config": "high"}
+ args["additionalModelRequestFields"] = {
+ "reasoning_config": chat_request.reasoning_effort # Direct string: low/medium/high
+ }
+ if DEBUG:
+ logger.info(f"Applied reasoning_config={chat_request.reasoning_effort} for DeepSeek v3")
+ else:
+ # For other models (Qwen, etc.), ignore reasoning_effort parameter
+ if DEBUG:
+ logger.info(f"reasoning_effort parameter ignored for model {chat_request.model} (not supported)")
# add tool config
if chat_request.tools:
tool_config = {"tools": [self._convert_tool_spec(t.function) for t in chat_request.tools]}
- if chat_request.tool_choice and not chat_request.model.startswith("meta.llama3-1-"):
+ if chat_request.tool_choice and not chat_request.model.startswith(
+ "meta.llama3-1-"
+ ):
if isinstance(chat_request.tool_choice, str):
# auto (default) is mapped to {"auto" : {}}
# required is mapped to {"any" : {}}
@@ -437,11 +818,46 @@ def _parse_request(self, chat_request: ChatRequest) -> dict:
tool_config["toolChoice"] = {"auto": {}}
else:
# Specific tool to use
- assert "function" in chat_request.tool_choice
+ if "function" not in chat_request.tool_choice:
+ raise ValueError("tool_choice must contain 'function' key when specifying a specific tool")
tool_config["toolChoice"] = {"tool": {"name": chat_request.tool_choice["function"].get("name", "")}}
args["toolConfig"] = tool_config
+ # Add additional fields to enable extend thinking or other model-specific features
+ if chat_request.extra_body:
+ # Filter out prompt_caching (our control field, not for Bedrock)
+ additional_fields = {
+ k: v for k, v in chat_request.extra_body.items()
+ if k != "prompt_caching"
+ }
+
+ if additional_fields:
+ # Only set additionalModelRequestFields if there are actual fields to pass
+ args["additionalModelRequestFields"] = additional_fields
+
+ # Extended thinking doesn't support both temperature and topP
+ # Remove topP to avoid validation error
+ if "thinking" in additional_fields:
+ inference_config.pop("topP", None)
+
return args
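A hypothetical request that exercises the extra_body pass-through above: `prompt_caching` is stripped before the remaining fields are forwarded, and the `thinking` block causes topP to be dropped. The endpoint, key, and model ID are placeholders:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/api/v1", api_key="bedrock")
client.chat.completions.create(
    model="anthropic.claude-3-7-sonnet-20250219-v1:0",  # illustrative model ID
    messages=[{"role": "user", "content": "Outline a migration plan."}],
    extra_body={
        "prompt_caching": {"system": True},                      # gateway control field, not sent to Bedrock
        "thinking": {"type": "enabled", "budget_tokens": 2048},  # forwarded via additionalModelRequestFields
    },
)
```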
+ def _estimate_reasoning_tokens(self, content: list[dict]) -> int:
+ """
+ Estimate reasoning tokens from reasoningContent blocks.
+
+ Bedrock doesn't separately report reasoning tokens, so we estimate
+ them using tiktoken to maintain OpenAI API compatibility.
+ """
+ reasoning_text = ""
+ for block in content:
+ if "reasoningContent" in block:
+ reasoning_text += block["reasoningContent"]["reasoningText"].get("text", "")
+
+ if reasoning_text:
+ # Use tiktoken to estimate token count
+ return len(ENCODER.encode(reasoning_text))
+ return 0
+
def _create_response(
self,
model: str,
@@ -450,6 +866,9 @@ def _create_response(
finish_reason: str | None = None,
input_tokens: int = 0,
output_tokens: int = 0,
+ total_tokens: int = 0,
+ cache_read_tokens: int = 0,
+ cache_creation_tokens: int = 0,
) -> ChatResponse:
message = ChatResponseMessage(
role="assistant",
@@ -476,11 +895,37 @@ def _create_response(
message.content = ""
for c in content:
if "reasoningContent" in c:
- message.reasoning_content = c["reasoningContent"]["reasoningText"].get("text", "")
+ message.reasoning_content = c["reasoningContent"][
+ "reasoningText"
+ ].get("text", "")
elif "text" in c:
message.content = c["text"]
else:
- logger.warning("Unknown tag in message content " + ",".join(c.keys()))
+ logger.warning(
+ "Unknown tag in message content " + ",".join(c.keys())
+ )
+ if message.reasoning_content:
+ message.content = f"{message.reasoning_content}{message.content}"
+ message.reasoning_content = None
+
+ # Create prompt_tokens_details if cache metrics are available
+ prompt_tokens_details = None
+ if cache_read_tokens > 0 or cache_creation_tokens > 0:
+ # Map Bedrock cache metrics to OpenAI format
+ # cached_tokens represents tokens read from cache (cache hits)
+ prompt_tokens_details = PromptTokensDetails(
+ cached_tokens=cache_read_tokens,
+ audio_tokens=0,
+ )
+
+ # Create completion_tokens_details if reasoning content exists
+ completion_tokens_details = None
+ reasoning_tokens = self._estimate_reasoning_tokens(content) if content else 0
+ if reasoning_tokens > 0:
+ completion_tokens_details = CompletionTokensDetails(
+ reasoning_tokens=reasoning_tokens,
+ audio_tokens=0,
+ )
response = ChatResponse(
id=message_id,
@@ -496,7 +941,9 @@ def _create_response(
usage=Usage(
prompt_tokens=input_tokens,
completion_tokens=output_tokens,
- total_tokens=input_tokens + output_tokens,
+ total_tokens=total_tokens if total_tokens > 0 else input_tokens + output_tokens,
+ prompt_tokens_details=prompt_tokens_details,
+ completion_tokens_details=completion_tokens_details,
),
)
response.system_fingerprint = "fp"
@@ -504,7 +951,9 @@ def _create_response(
response.created = int(time.time())
return response
- def _create_response_stream(self, model_id: str, message_id: str, chunk: dict) -> ChatStreamResponse | None:
+ def _create_response_stream(
+ self, model_id: str, message_id: str, chunk: dict
+ ) -> ChatStreamResponse | None:
"""Parsing the Bedrock stream response chunk.
Ref: https://docs.aws.amazon.com/bedrock/latest/userguide/conversation-inference.html#message-inference-examples
@@ -547,11 +996,19 @@ def _create_response_stream(self, model_id: str, message_id: str, chunk: dict) -
content=delta["text"],
)
elif "reasoningContent" in delta:
- # ignore "signature" in the delta.
if "text" in delta["reasoningContent"]:
- message = ChatResponseMessage(
- reasoning_content=delta["reasoningContent"]["text"],
- )
+ content = delta["reasoningContent"]["text"]
+ if not self.think_emitted:
+ # Open the thinking block on the first reasoning delta (mirrors Anthropic's "content_block_start" for thinking)
+ content = "<think>" + content
+ self.think_emitted = True
+ message = ChatResponseMessage(content=content)
+ elif "signature" in delta["reasoningContent"]:
+ # Close the thinking block when the signature delta arrives (mirrors "signature_delta")
+ if self.think_emitted:
+ message = ChatResponseMessage(content="\n</think>\n\n")
+ else:
+ return None  # Ignore the signature if no thinking block was started
else:
# tool use
index = chunk["contentBlockDelta"]["contentBlockIndex"] - 1
@@ -574,14 +1031,36 @@ def _create_response_stream(self, model_id: str, message_id: str, chunk: dict) -
metadata = chunk["metadata"]
if "usage" in metadata:
# token usage
+ usage_data = metadata["usage"]
+
+ # Extract prompt caching metrics if available
+ cache_read_tokens = usage_data.get("cacheReadInputTokens", 0)
+ cache_creation_tokens = usage_data.get("cacheWriteInputTokens", 0)  # Converse reports cache writes as cacheWriteInputTokens
+
+ # Create prompt_tokens_details if cache metrics are available
+ prompt_tokens_details = None
+ if cache_read_tokens > 0 or cache_creation_tokens > 0:
+ prompt_tokens_details = PromptTokensDetails(
+ cached_tokens=cache_read_tokens,
+ audio_tokens=0,
+ )
+
+ # Calculate actual prompt tokens
+ # Bedrock's totalTokens includes all tokens
+ # prompt_tokens = totalTokens - outputTokens
+ total_tokens = usage_data["totalTokens"]
+ output_tokens = usage_data["outputTokens"]
+ actual_prompt_tokens = total_tokens - output_tokens
+
return ChatStreamResponse(
id=message_id,
model=model_id,
choices=[],
usage=Usage(
- prompt_tokens=metadata["usage"]["inputTokens"],
- completion_tokens=metadata["usage"]["outputTokens"],
- total_tokens=metadata["usage"]["totalTokens"],
+ prompt_tokens=actual_prompt_tokens,
+ completion_tokens=output_tokens,
+ total_tokens=total_tokens,
+ prompt_tokens_details=prompt_tokens_details,
),
)
if message:
@@ -616,7 +1095,7 @@ def _parse_image(self, image_url: str) -> tuple[bytes, str]:
return base64.b64decode(image_data), content_type.group(1)
# Send a request to the image URL
- response = requests.get(image_url)
+ response = requests.get(image_url, timeout=30)
# Check if the request was successful
if response.status_code == 200:
content_type = response.headers.get("Content-Type")
@@ -626,7 +1105,9 @@ def _parse_image(self, image_url: str) -> tuple[bytes, str]:
image_content = response.content
return image_content, content_type
else:
- raise HTTPException(status_code=500, detail="Unable to access the image url")
+ raise HTTPException(
+ status_code=500, detail="Unable to access the image url"
+ )
def _parse_content_parts(
self,
@@ -686,7 +1167,9 @@ def _convert_tool_spec(self, func: Function) -> dict:
}
}
- def _calc_budget_tokens(self, max_tokens: int, reasoning_effort: Literal["low", "medium", "high"]) -> int:
+ def _calc_budget_tokens(
+ self, max_tokens: int, reasoning_effort: Literal["low", "medium", "high"]
+ ) -> int:
# Helper function to calculate budget_tokens based on the max_tokens.
# Ratio for efforts: Low - 30%, medium - 60%, High: Max token - 1
# Note that The minimum budget_tokens is 1,024 tokens so far.
@@ -717,7 +1200,9 @@ def _convert_finish_reason(self, finish_reason: str | None) -> str | None:
"complete": "stop",
"content_filtered": "content_filter",
}
- return finish_reason_mapping.get(finish_reason.lower(), finish_reason.lower())
+ return finish_reason_mapping.get(
+ finish_reason.lower(), finish_reason.lower()
+ )
return None
@@ -808,7 +1293,9 @@ def _parse_args(self, embeddings_request: EmbeddingsRequest) -> dict:
return args
def embed(self, embeddings_request: EmbeddingsRequest) -> EmbeddingsResponse:
- response = self._invoke_model(args=self._parse_args(embeddings_request), model_id=embeddings_request.model)
+ response = self._invoke_model(
+ args=self._parse_args(embeddings_request), model_id=embeddings_request.model
+ )
response_body = json.loads(response.get("body").read())
if DEBUG:
logger.info("Bedrock response body: " + str(response_body))
@@ -824,10 +1311,15 @@ class TitanEmbeddingsModel(BedrockEmbeddingsModel):
def _parse_args(self, embeddings_request: EmbeddingsRequest) -> dict:
if isinstance(embeddings_request.input, str):
input_text = embeddings_request.input
- elif isinstance(embeddings_request.input, list) and len(embeddings_request.input) == 1:
+ elif (
+ isinstance(embeddings_request.input, list)
+ and len(embeddings_request.input) == 1
+ ):
input_text = embeddings_request.input[0]
else:
- raise ValueError("Amazon Titan Embeddings models support only single strings as input.")
+ raise ValueError(
+ "Amazon Titan Embeddings models support only single strings as input."
+ )
args = {
"inputText": input_text,
# Note: inputImage is not supported!
@@ -841,7 +1333,9 @@ def _parse_args(self, embeddings_request: EmbeddingsRequest) -> dict:
return args
def embed(self, embeddings_request: EmbeddingsRequest) -> EmbeddingsResponse:
- response = self._invoke_model(args=self._parse_args(embeddings_request), model_id=embeddings_request.model)
+ response = self._invoke_model(
+ args=self._parse_args(embeddings_request), model_id=embeddings_request.model
+ )
response_body = json.loads(response.get("body").read())
if DEBUG:
logger.info("Bedrock response body: " + str(response_body))
@@ -860,6 +1354,8 @@ def get_embeddings_model(model_id: str) -> BedrockEmbeddingsModel:
match model_name:
case "Cohere Embed Multilingual" | "Cohere Embed English":
return CohereEmbeddingsModel()
+ case "Titan Embeddings G2 - Text":
+ return TitanEmbeddingsModel()
case _:
logger.error("Unsupported model id " + model_id)
raise HTTPException(
diff --git a/src/api/schema.py b/src/api/schema.py
index df805347..ffcbab9f 100644
--- a/src/api/schema.py
+++ b/src/api/schema.py
@@ -45,6 +45,11 @@ class ImageContent(BaseModel):
image_url: ImageUrl
+class ToolContent(BaseModel):
+ type: Literal["text"] = "text"
+ text: str
+
+
class SystemMessage(BaseModel):
name: str | None = None
role: Literal["system"] = "system"
@@ -66,7 +71,7 @@ class AssistantMessage(BaseModel):
class ToolMessage(BaseModel):
role: Literal["tool"] = "tool"
- content: str
+ content: str | list[ToolContent] | list[dict]
tool_call_id: str
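Examples of tool message payloads the widened union accepts (the import path and values are illustrative):

```python
from api.schema import ToolContent, ToolMessage  # assumed import path

ToolMessage(tool_call_id="call_1", content="42")
ToolMessage(tool_call_id="call_2", content=[ToolContent(text="42")])
ToolMessage(tool_call_id="call_3", content=[{"type": "text", "text": '{"answer": 42}'}])
```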
@@ -92,8 +97,8 @@ class ChatRequest(BaseModel):
presence_penalty: float | None = Field(default=0.0, le=2.0, ge=-2.0) # Not used
stream: bool | None = False
stream_options: StreamOptions | None = None
- temperature: float | None = Field(default=1.0, le=2.0, ge=0.0)
- top_p: float | None = Field(default=1.0, le=1.0, ge=0.0)
+ temperature: float | None = Field(default=None, le=2.0, ge=0.0)
+ top_p: float | None = Field(default=None, le=1.0, ge=0.0)
user: str | None = None # Not used
max_tokens: int | None = 2048
max_completion_tokens: int | None = None
@@ -102,12 +107,27 @@ class ChatRequest(BaseModel):
tools: list[Tool] | None = None
tool_choice: str | object = "auto"
stop: list[str] | str | None = None
+ extra_body: dict | None = None
+
+
+class PromptTokensDetails(BaseModel):
+ """Details about prompt tokens usage, following OpenAI API format."""
+ cached_tokens: int = 0
+ audio_tokens: int = 0
+
+
+class CompletionTokensDetails(BaseModel):
+ """Details about completion tokens usage, following OpenAI API format."""
+ reasoning_tokens: int = 0
+ audio_tokens: int = 0
class Usage(BaseModel):
prompt_tokens: int
completion_tokens: int
total_tokens: int
+ prompt_tokens_details: PromptTokensDetails | None = None
+ completion_tokens_details: CompletionTokensDetails | None = None
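A usage payload with both detail blocks populated would serialize roughly as below (numbers hypothetical, matching the worked example earlier in the diff):

```python
usage_example = {
    "prompt_tokens": 1280,
    "completion_tokens": 180,
    "total_tokens": 1460,
    "prompt_tokens_details": {"cached_tokens": 1024, "audio_tokens": 0},
    "completion_tokens_details": {"reasoning_tokens": 96, "audio_tokens": 0},
}
```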
class ChatResponseMessage(BaseModel):
diff --git a/src/api/setting.py b/src/api/setting.py
index e090300a..c69780b4 100644
--- a/src/api/setting.py
+++ b/src/api/setting.py
@@ -1,7 +1,5 @@
import os
-DEFAULT_API_KEYS = "bedrock"
-
API_ROUTE_PREFIX = os.environ.get("API_ROUTE_PREFIX", "/api/v1")
TITLE = "Amazon Bedrock Proxy APIs"
@@ -16,3 +14,5 @@
DEFAULT_MODEL = os.environ.get("DEFAULT_MODEL", "anthropic.claude-3-sonnet-20240229-v1:0")
DEFAULT_EMBEDDING_MODEL = os.environ.get("DEFAULT_EMBEDDING_MODEL", "cohere.embed-multilingual-v3")
ENABLE_CROSS_REGION_INFERENCE = os.environ.get("ENABLE_CROSS_REGION_INFERENCE", "true").lower() != "false"
+ENABLE_APPLICATION_INFERENCE_PROFILES = os.environ.get("ENABLE_APPLICATION_INFERENCE_PROFILES", "true").lower() != "false"
+ENABLE_PROMPT_CACHING = os.environ.get("ENABLE_PROMPT_CACHING", "false").lower() == "true"
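A minimal sketch of how the new flags behave, assuming the parsing shown above: profile discovery is on by default, while prompt caching is opt-in.

```python
import os

os.environ.setdefault("ENABLE_PROMPT_CACHING", "false")   # default: caching disabled
ENABLE_PROMPT_CACHING = os.environ.get("ENABLE_PROMPT_CACHING", "false").lower() == "true"
assert ENABLE_PROMPT_CACHING is False  # set ENABLE_PROMPT_CACHING=true to enable
```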
diff --git a/src/requirements.txt b/src/requirements.txt
index 7ad8fb10..9aa0e2da 100644
--- a/src/requirements.txt
+++ b/src/requirements.txt
@@ -1,9 +1,9 @@
-fastapi==0.115.8
-pydantic==2.7.1
+fastapi==0.116.1
+pydantic==2.11.4
uvicorn==0.29.0
mangum==0.17.0
-tiktoken==0.6.0
-requests==2.32.3
-numpy==1.26.4
-boto3==1.37.0
-botocore==1.37.0
\ No newline at end of file
+tiktoken==0.9.0
+requests==2.32.4
+numpy==2.2.5
+boto3==1.40.4
+botocore==1.40.4