from pydantic import BaseModel, Field
# Location of the screenshot sent to the model as an image part.
# A "data:image/png;base64,..." URI can be used here instead of an http(s) URL.
SCREENSHOT_URL = "https://your-host/screenshot.png"

# Pixel size of the screenshot; used to rescale the model's normalized
# coordinates back to absolute pixel positions.
SCREENSHOT_WIDTH = 1280
SCREENSHOT_HEIGHT = 720

# Natural-language description of the UI element to locate.
ELEMENT = "the 'Sign in' button in the top-right corner"
# Structured answer expected from the model: one click position whose
# coordinates are integers constrained to the inclusive range [0, 1000].
# NOTE: documented with comments rather than a class docstring on purpose --
# pydantic copies a model docstring into the generated JSON schema, which
# would alter the schema that is derived from this class.
class VisualLocalizerOutput(BaseModel):
    # Horizontal position; validated to 0 <= x <= 1000.
    x: int = Field(ge=0, le=1000, description="X coordinate as integer in [0, 1000]")
    # Vertical position; validated to 0 <= y <= 1000.
    y: int = Field(ge=0, le=1000, description="Y coordinate as integer in [0, 1000]")
# JSON schema (a plain dict) describing the answer format; it is both shown to
# the model inside the prompt and reused to constrain decoding in the request.
schema = VisualLocalizerOutput.model_json_schema()

# Instruction text: task statement, required output format, then the target.
# NOTE(review): `schema` is interpolated through its Python dict repr (single
# quotes), not as JSON text -- confirm this is the format the model expects
# before switching to json.dumps.
prompt = "".join(
    [
        "Localize an element on the GUI image according to the provided target ",
        "and output a click position.\n",
        f" * You must output a valid JSON following the format: {schema}\n",
        f" Your target is:\n{ELEMENT}",
    ]
)
# The single user turn carries the screenshot first, then the text instructions.
_image_part = {"type": "image_url", "image_url": {"url": SCREENSHOT_URL}}
_text_part = {"type": "text", "text": prompt}

# Extra fields placed in the request body alongside the standard parameters.
# NOTE(review): "structured_outputs" and "chat_template_kwargs" look like
# server-side extensions (schema-constrained output, thinking disabled) --
# confirm against the serving backend's documentation.
_request_extras = {
    "structured_outputs": {"json": schema},
    "chat_template_kwargs": {"enable_thinking": False},
}

# temperature=0.0 is passed so the request asks for the least random sampling.
response = client.chat.completions.create(
    model=MODEL_NAME,
    messages=[{"role": "user", "content": [_image_part, _text_part]}],
    extra_body=_request_extras,
    temperature=0.0,
)
# Parse the first choice's message text as JSON and validate it against the
# output model (pydantic raises if the payload does not match).
point = VisualLocalizerOutput.model_validate_json(response.choices[0].message.content)

# Rescale from the normalized 0-1000 grid to screenshot pixels.
# - Integer arithmetic avoids float rounding in `x / 1000 * size` that could
#   land one pixel low.
# - The model allows the value 1000, which would map to exactly the
#   width/height, i.e. one past the last valid pixel index, so the result is
#   clamped to `size - 1`.
abs_x = min(point.x * SCREENSHOT_WIDTH // 1000, SCREENSHOT_WIDTH - 1)
abs_y = min(point.y * SCREENSHOT_HEIGHT // 1000, SCREENSHOT_HEIGHT - 1)
print(f"Click at ({abs_x}, {abs_y})")