Added support for GPT-4o
waltervanheuven committed May 15, 2024
1 parent 601c336 commit 55b89f8
Showing 2 changed files with 26 additions and 22 deletions.
18 changes: 10 additions & 8 deletions README.md
@@ -133,18 +133,20 @@ python source/auto_alt_text.py pptx/test1.pptx --show_openclip_models
python source/auto_alt_text.py pptx/test1.pptx --model openclip --openclip_model coca_ViT-L-14 --openclip_pretrained mscoco_finetuned_laion2B-s13B-b90k
```

## GPT-4V
## OpenAI Vision models

To use [GPT-4V](https://openai.com/research/gpt-4v-system-card) you need to have [API access](https://help.openai.com/en/articles/7102672-how-can-i-access-gpt-4). Images will be sent to OpenAI servers for inference. Costs for using the API depend on the size and number of the images. See the API access [pricing information](https://openai.com/pricing#language-models). The script uses the OPENAI_API_KEY environment variable. Information on how to set/add this variable can be found in the [OpenAI quickstart docs](https://platform.openai.com/docs/quickstart?context=python).
To use [OpenAI](https://openai.com)'s models that support vision ([GPT-4o](https://openai.com/index/hello-gpt-4o/), GPT-4 Turbo) you need to have [API access](https://help.openai.com/en/articles/7102672-how-can-i-access-gpt-4). Images will be sent to OpenAI servers for inference. Costs for using the API depend on the size and number of the images. See the API access [pricing information](https://openai.com/pricing#language-models). The script uses the OPENAI_API_KEY environment variable. Information on how to set/add this variable can be found in the [OpenAI quickstart docs](https://platform.openai.com/docs/quickstart?context=python).
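
For example, on macOS or Linux the key can be set for the current shell session before running the script (a minimal sketch; the key value is a placeholder):

```sh
# set the OpenAI API key for the current shell session (placeholder value)
export OPENAI_API_KEY="sk-..."
```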

To use GPT-4o, use `--model gpt-4o`; for GPT-4 Turbo, use `--model gpt-4-turbo`.

```sh
python source/auto_alt_text.py pptx/test1.pptx --model gpt-4v
python source/auto_alt_text.py pptx/test1.pptx --model gpt-4o

# custom prompt
python source/auto_alt_text.py pptx/test1.pptx --model gpt-4v --prompt "Describe clearly in two sentences"
python source/auto_alt_text.py pptx/test1.pptx --model gpt-4o --prompt "Provide an image caption"
```

## LLaVA and other multimodal LLMs
## LLaVA and other Multimodal LLMs

LLaVA and other multimodal large language models can be used through [Ollama](https://ollama.com/). These models run locally. Which model you can use depends on the capabilities of your computer (e.g. memory, GPU).
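
For example, assuming Ollama is already installed, a vision model such as LLaVA can be downloaded once and then listed to confirm that it is available:

```sh
# download the LLaVA model (one-time) and confirm it is installed
ollama pull llava
ollama list
```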

@@ -163,16 +163,16 @@ ollama list
### Example of using LLaVA through Ollama

```sh
python source/auto_alt_text.py pptx/test1.pptx --model llava --use_ollama

# to disable the default image resizing to 500px x 500px, set the resize value to 0
python source/auto_alt_text.py pptx/test1.pptx --model llava --use_ollama --resize 0

# specify a different prompt
python source/auto_alt_text.py pptx/test1.pptx --model llava --use_ollama --prompt "Describe in simple words using one sentence."

# specify a different server or port for the Ollama server; the default server is localhost and the default port is 11434
python source/auto_alt_text.py pptx/test1.pptx --model llava --use_ollama --server my_server.com --port 3456
```
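
If the Ollama server runs on a different machine, it typically needs to listen on a non-localhost interface. With a default Ollama installation this can be configured through the OLLAMA_HOST environment variable (a sketch, assuming port 3456 as in the example above):

```sh
# on the remote machine: have Ollama listen on all interfaces on port 3456 (assumed setup)
OLLAMA_HOST=0.0.0.0:3456 ollama serve
```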

## Edit generated alt texts and apply to PowerPoint file
30 changes: 16 additions & 14 deletions source/auto_alt_text.py
@@ -16,21 +16,24 @@
import re
import pathlib
import requests
from urllib3.exceptions import HTTPError
#from urllib3.exceptions import HTTPError
from PIL import Image
import psutil
import open_clip
import torch
#import ollama
from transformers import AutoProcessor, AutoModelForVision2Seq, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer
#from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
from transformers.generation import GenerationConfig
from openai import OpenAI
from pptx import Presentation
from pptx.util import Cm
from pptx.oxml.ns import _nsmap
from pptx.enum.shapes import MSO_SHAPE_TYPE
from pptx.shapes.base import BaseShape


def check_server_is_running(url: str) -> bool:
""" URL accessible? """
status:bool = False
@@ -346,9 +349,9 @@ def init_model(settings: dict) -> bool:
settings["cogvlm-model"] = model
settings["cogvlm-tokenizer"] = tokenizer

elif model_str == "gpt-4v":
print("GPT-4V")
print(f"model: {settings['gpt4v_model']}")
elif model_str == "gpt-4o" or model_str == "gpt-4-turbo":
print("OpenAI")
print(f"model: {model_str}")
print(f"prompt: '{prompt}'")
else:
print(f"Unknown model: '{model_str}'")
@@ -852,8 +855,8 @@ def generate_description(image_file_path: str, extension:str, settings: dict, fo
alt_text, err = qwen_vl(image_file_path, settings, for_notes, debug)
elif model_str == "cogvlm":
alt_text, err = cog_vlm(image_file_path, settings, for_notes, debug)
elif model_str == "gpt-4v":
alt_text, err = gpt4v(image_file_path, extension, settings, for_notes, debug)
elif model_str == "gpt-4o" or model_str == "gpt-4-turbo":
alt_text, err = use_openai(image_file_path, extension, settings, for_notes, debug)
else:
print(f"Unknown model: {model_str}")

@@ -1157,7 +1160,7 @@ def img_file_to_base64(image_file_path:str , settings: dict, debug:bool=False) -

return base64_str

def gpt4v(image_file_path: str, extension:str, settings: dict, for_notes:bool=False, debug:bool=False) -> Tuple[str, bool]:
def use_openai(image_file_path: str, extension:str, settings: dict, for_notes:bool=False, debug:bool=False) -> Tuple[str, bool]:
""" get image description from GPT-4V """
err:bool = False
alt_text:str = "Error"
@@ -1200,7 +1203,7 @@ def gpt4v(image_file_path: str, extension:str, settings: dict, for_notes:bool=Fa
"Authorization": f"Bearer {api_key}"
}
payload = {
"model": settings["gpt4v_model"],
"model": settings["model"],
"messages": [
{
"role": "user",
@@ -1221,9 +1224,9 @@ def gpt4v(image_file_path: str, extension:str, settings: dict, for_notes:bool=Fa
"max_tokens": 300
}

gpt4v_server = "https://api.openai.com/v1/chat/completions"
openai_server = "https://api.openai.com/v1/chat/completions"
try:
response = requests.post(gpt4v_server, headers=headers, json=payload, timeout=20)
response = requests.post(openai_server, headers=headers, json=payload, timeout=20)

json_out = response.json()

@@ -1237,7 +1240,7 @@ def gpt4v(image_file_path: str, extension:str, settings: dict, for_notes:bool=Fa
else:
alt_text = json_out["choices"][0]["message"]["content"]
except requests.exceptions.ConnectionError:
print(f"ConnectionError: Unable to access the server at: '{gpt4v_server}'")
print(f"ConnectionError: Unable to access the server at: '{openai_server}'")
err = True
except TimeoutError:
print("TimeoutError")
@@ -1861,7 +1864,7 @@ def main() -> int:
return int(err)

# set default prompt
if model_str == "gpt-4v":
if model_str == "gpt-4-turbo" or model_str == "gpt-4o":
if args.prompt == "":
prompt = "Describe the image using one or two sentences. Do not mention the word 'image'."
elif model_str == "kosmos-2":
@@ -1907,7 +1910,6 @@ def main() -> int:
"cogvlm-tokenizer": None,
"use_ollama": args.use_ollama,
"ollama_url": f"{args.server}:{args.port}",
"gpt4v_model": "gpt-4-vision-preview",
"cuda_available": torch.cuda.is_available(),
"mps_available": torch.backends.mps.is_available(),
"prompt": prompt,
