import requests
import base64
import json
from IPython.display import Image
api_key = '<api-key>'
base_url = '<endpoint>'
API Examples¶
The API is a REST API that exposes HTTP endpoints. The following demonstrates using Python's requests library, but you can use any technique capable of making HTTP requests to access the API.
Authenticating & Headers¶
Your API key should be passed as a Bearer token in the Authorization header.
The value of the Content-Type header should be application/json.
endpoint = f'{base_url}/api/v1/chat/completions'
headers = {'accept': 'application/json', 'authorization': f'Bearer {api_key}', 'Content-Type': 'application/json'}
Minimal Chat Example¶
The simplest possible chat with just a model and a user prompt.
data = {'model': 'llama_4_maverick', 'messages': [{'role': 'user', 'content': 'Hello'}]}
resp = requests.post(endpoint, json=data, headers=headers)
if resp.status_code >= 300:
print('error', resp.status_code)
response_object = resp.json()
response_object
{'id': 'chatcmpl-688626cc-fb3e-477a-9a0d-1da7ea49bb87',
'object': 'chat.completion',
'created': 1772229798,
'model': 'llama_4_maverick',
'choices': [{'index': 0,
'message': {'role': 'assistant',
'content': "Hello! It's nice to meet you. Is there something I can help you with or would you like to chat?"},
'finish_reason': 'stop'}],
'usage': {'prompt_tokens': 36, 'completion_tokens': 24, 'total_tokens': 60}}
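As an alternative to checking status_code by hand, the requests library can raise an exception for error responses. A minimal sketch of the same call using raise_for_status():
# raise_for_status() raises requests.HTTPError for any 4xx/5xx response
resp = requests.post(endpoint, json=data, headers=headers)
resp.raise_for_status()
response_object = resp.json()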
Using a system prompt¶
The messages array can contain multiple messages of various roles. Including a system prompt influences how the model behaves.
data = {
'model': 'llama_4_maverick',
'messages': [
{'role': 'system', 'content': 'You are a pirate and only speak in pirate'},
{'role': 'user', 'content': 'Hello'},
],
}
resp = requests.post(endpoint, json=data, headers=headers)
if resp.status_code >= 300:
print('error', resp.status_code)
response_object = resp.json()
print(response_object['choices'][0]['message']['content'])
Arrrr, 'ello there, matey! Yer lookin' fer a swashbucklin' adventure, eh? Yer in fer a treat, I be Captain Blackbeak, the scurviest pirate to ever sail the seven seas! What be bringin' ye to these waters?
Including image data with a prompt¶
Images must be encoded as a data URI like data:image/jpeg;base64,<base64 data>
Include the image as another content item:
{
    "type": "image_url",
    "image_url": {
        "url": "<your base64-encoded data URI>"
    },
    "detail": "auto"
}
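If your image lives on disk, one way to build the data URI is to guess the MIME type from the file name. Here is a minimal sketch using only the standard library (the helper name to_data_uri is ours, not part of the API):
import base64
import mimetypes

def to_data_uri(image_path):
    # guess the MIME type from the file extension, e.g. image/jpeg
    mime, _ = mimetypes.guess_type(image_path)
    with open(image_path, 'rb') as f:
        b64 = base64.b64encode(f.read()).decode('utf-8')
    return f'data:{mime};base64,{b64}'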
Below we have a request object with three items:
- System prompt
- User prompt
- Image content
The example also demonstrates crafting the prompt to encourage the model to respond with valid JSON data.
seal_image_path = 'images/ous_white.jpg'
stapler_image_path = 'images/red_stapler.jpg'
Image(filename=seal_image_path)
def make_prompt(image_path, description):
    image = Image(filename=image_path)
    image_bytes = image.data
    # encode the image bytes as base64 and build a data URI
    # (the sample images are JPEGs)
    s = base64.b64encode(image_bytes).decode('utf-8')
    encoded = 'data:image/jpeg;base64,' + s
    return {
        'model': 'gemini-2.5-flash',
        'messages': [
            {
                'role': 'system',
                'content': """
                I want to directly parse your response as JSON. Do not include any text other than the plain JSON.
                Do not include the word json, do not include newline characters. Only the exact JSON text.
                """,
            },
            {
                'role': 'user',
                'content': [
                    {
                        'type': 'text',
                        'text': f"""I have the following description of this image:
                        {description}
                        I would like to know if this description is generally correct. The description does not need to be
                        specific or detailed. The general category is still considered correct if it matches
                        the image. Format your answer as a JSON object with two pieces of information: a boolean indicating
                        whether the description matches, and a description, which is your identification of the image:
                        {{"is_correct": boolean indicating if the description is generally correct, "description": your description of the image}}.
                        """,
                    },
                    {'type': 'image_url', 'image_url': {'url': encoded}, 'detail': 'auto'},
                ],
            },
        ],
    }
prompt = make_prompt(seal_image_path, 'a red stapler')
resp = requests.post(endpoint, json=prompt, headers=headers)
response_object = resp.json()
response_content = response_object['choices'][0]['message']['content']
json.loads(response_content)
{'is_correct': False,
'description': "A circular emblem or seal featuring a cartoon computer with a grumpy face, surrounded by stars, and the text 'OFFICE OF UNSPECIFIED SERVICES' on a blue and gold ribbon."}
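Even with a strict system prompt, models sometimes wrap JSON in markdown code fences anyway, which would make json.loads fail. A small defensive parser is cheap insurance; this helper (parse_json_reply, our own name) is a sketch, not part of the API:
def parse_json_reply(text):
    # best-effort parse of a reply that should be plain JSON;
    # strips markdown code fences if the model added them anyway
    text = text.strip()
    if text.startswith('```'):
        text = text.split('\n', 1)[1]    # drop the opening fence (possibly ```json)
        text = text.rsplit('```', 1)[0]  # drop the closing fence
    return json.loads(text)

parse_json_reply(response_content)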
Including documents¶
Similar to images, documents are included as another content item with base64 data.
def make_document_prompt(model, document_path, prompt):
with open(document_path, 'rb') as f:
pdf_bytes = f.read()
pdf_base_64 = base64.b64encode(pdf_bytes).decode('utf-8')
encoded = 'data:application/pdf;base64,' + pdf_base_64
return {
'model': model,
'messages': [
{
'role': 'user',
'content': [
{'type': 'text', 'text': prompt},
{'type': 'file', 'file': {'file_name': 'Test', 'file_data': encoded}},
],
}
],
}
prompt = make_document_prompt('gemini-2.5-flash', 'documents/AWorldofLovesample.pdf', 'Please summarize this document.')
resp = requests.post(endpoint, json=prompt, headers=headers)
print(resp)
response_object = resp.json()
response_object['choices'][0]['message']['content']
<Response [200]>
'The document vividly describes an unusual June sunrise over an Irish landscape, highlighting a river gorge, distant mountains, and the mansion of Montefort. Montefort itself is depicted as a once grand estate now in picturesque disrepair, with neglected grounds and dilapidated farm buildings contrasting with remnants of its former elegance, like a stone archway, and its dramatic position overlooking the gorge.\n\nA 20-year-old woman named Jane emerges from the house, dressed in an anachronistic Edwardian muslin dress. She walks to an obelisk, rereads a letter, and then intently observes two drawn upstairs windows of Montefort. The text concludes with an intimate glimpse inside a dimly lit, claret-red room, featuring a large four-poster bed and an assortment of personal effects on a bedside table, hinting at a complex inner world or domestic setting.'
Model Differences¶
Different models have different capabilities. The document AWorldofLovesample.pdf, which we passed to the model above, is a scanned PDF that contains only an image of the text; it doesn't contain any text content. Gemini is capable of performing OCR on the image and extracting text from it.
On the other hand, Llama does not do this (at the moment) and responds that it sees a blank document. To use this API effectively, it is important to understand the differences between the underlying models.
prompt = make_document_prompt('llama_4_maverick', 'documents/AWorldofLovesample.pdf', 'Please summarize this document.')
resp = requests.post(endpoint, json=prompt, headers=headers)
response_object = resp.json()
response_object['choices'][0]['message']['content']
'There is no document content to summarize. The document "Untitled.pdf" is listed, but its content is empty.'
Tool Calls¶
You can provide the API with functions that it can ask you to call if it needs more information. This is the basis for agentic AI and is a powerful technique for making domain-specific information available to models.
This requires two calls to the API:
- Pass the prompt along with a list of tools available to the LLM
- Extract any tool calls the LLM requests from its response
- Call the function(s) with the provided arguments
- Call the API again with the results of the function execution
import random
# An arbitrary function you would like the LLM to have at its disposal.
# This function could be anything: something that interfaces with a DB, calls an API, etc.
# This is a toy example; there's nothing magic about this number, but the LLM doesn't need to know that.
def magic_number(factor):
"""Generate a magic number that is a multiple of factor"""
return random.randint(10000, 50000) * factor
# Tell the LLM about this function and its expected parameters
prompt = {
'model': 'gemini-2.5-flash',
'messages': [{'role': 'user', 'content': 'Generate a magic number that is a multiple of 100'}],
'tools': [
{
'type': 'function',
'function': {
'name': 'magic_number',
'description': 'Generates a magic number that is a multiple of a given factor',
'parameters': {
'type': 'object',
'properties': {
'factor': {
'type': 'number',
'description': 'The number that will be a factor of the magic number',
}
},
'required': ['factor'],
},
},
}
],
}
resp = requests.post(endpoint, json=prompt, headers=headers)
response_object = resp.json()
# The LLM responds in the `tool_calls` section of the response that it
# would like to call this function in order to answer the user's prompt
function_call = response_object['choices'][0]['message']['tool_calls'][0]
tool_id = function_call['id']
tool_name = function_call['function']['name']
args = function_call['function']['arguments']
print('id:', tool_id, 'name:', tool_name, 'args:', args)
id: call_3757dadc6d name: magic_number args: {"factor": 100}
# Call the function
args = json.loads(args)
func_result = {'magic_number': magic_number(args['factor'])}
func_result
{'magic_number': 2683700}
# Craft a prompt that contains the result of the tool call
prompt = {
'model': 'gemini-2.5-flash',
'messages': [
{'role': 'user', 'content': 'Generate a magic number that is a multiple of 100'},
{
'role': 'assistant',
'tool_calls': [
{'id': tool_id, 'type': 'function', 'function': {'name': tool_name, 'arguments': json.dumps(args)}}
],
},
{'role': 'tool', 'tool_call_id': tool_id, 'content': json.dumps(func_result)},
],
}
resp = requests.post(endpoint, json=prompt, headers=headers)
response_object = resp.json()
# The LLM can now answer the original prompt
# using the result obtained from calling the function
response_object['choices'][0]['message']
{'role': 'assistant', 'content': 'The magic number is 2683700.'}
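To handle more than one round of tool calls, you can wrap this exchange in a loop that keeps calling the API until the model stops requesting tools. The following is a sketch that assumes the same message format used above; run_with_tools and available_tools are our own names:
# map tool names to the Python functions that implement them
available_tools = {'magic_number': magic_number}

def run_with_tools(messages, tools, model='gemini-2.5-flash', max_rounds=5):
    for _ in range(max_rounds):
        prompt = {'model': model, 'messages': messages, 'tools': tools}
        resp = requests.post(endpoint, json=prompt, headers=headers)
        message = resp.json()['choices'][0]['message']
        tool_calls = message.get('tool_calls')
        if not tool_calls:
            return message['content']
        # echo the assistant's tool request, then append each tool result
        messages.append(message)
        for call in tool_calls:
            func = available_tools[call['function']['name']]
            args = json.loads(call['function']['arguments'])
            messages.append({
                'role': 'tool',
                'tool_call_id': call['id'],
                'content': json.dumps(func(**args)),
            })
    raise RuntimeError('model was still requesting tools after max_rounds rounds')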
Using third party libraries for tool calls¶
As you can see above, managing conversation state and message formats over the course of even a toy example can be tricky.
Since this API is modelled after the OpenAI Chat Completions API, you can use many tools that expect this format.
Here's an example of using Pydantic AI to simplify calling the same tool we created above. You will need to install the Pydantic AI Python library (pydantic-ai) for the following to work.
import random
from pydantic_ai import Agent
from pydantic_ai.models.openai import OpenAIChatModel
from pydantic_ai.providers.openai import OpenAIProvider
model = OpenAIChatModel('gemini-2.5-flash', provider=OpenAIProvider(base_url=base_url + '/api/v1', api_key=api_key))
agent = Agent(model)
# Use Pydantic's decorator to make this function
# available as a tool to the LLM.
@agent.tool_plain
def magic_number(factor: int) -> int:
"""Generate a magic number that is a multiple of factor"""
print('Calling magic number tool with factor:', factor)
return random.randint(10000, 50000) * factor
# We can give it more than one tool to use
@agent.tool_plain
def special_number() -> int:
"""Generate a special number"""
sn = random.randint(1, 20)
print('Calling tool to get special number which is:', sn)
return sn
# In a notebook we can await the agent directly; in a plain
# script, use agent.run_sync(...) instead.
resp = await agent.run(
    'Please get a special number and then give me a magic number that is a multiple of the special number',
)
print(resp.output)
Calling tool to get special number which is: 15
Calling magic number tool with factor: 15
The special number is 15. The magic number is 438000.
Embeddings¶
endpoint = f'{base_url}/api/v1/embeddings'
headers = {'accept': 'application/json', 'authorization': f'Bearer {api_key}', 'Content-Type': 'application/json'}
request = {
'model': 'cohere_english_v3',
'encodingFormat': 'float',
'input': [
'Sweet is the breath of morn, her rising sweet, With charm of earliest birds',
'With Midnight to the North of Her — And Midnight to the South of Her — And Maelstrom — in the Sky',
],
'input_type': 'search_document',
}
resp = requests.post(endpoint, json=request, headers=headers)
result = resp.json()
result['data'][0]['embedding'][0:10]
[-0.0036468506, -0.023422241, -0.038757324, -0.038635254, -0.008544922, -0.0076293945, 0.004508972, 0.022476196, 0.0019721985, 0.017059326]
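Once you have vectors, cosine similarity is the usual way to compare them. A minimal sketch with NumPy (the helper name is ours), comparing the two sentences we just embedded:
import numpy as np

def cosine_similarity(a, b):
    a, b = np.asarray(a), np.asarray(b)
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

cosine_similarity(result['data'][0]['embedding'], result['data'][1]['embedding'])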
Using embeddings for simple clustering¶
Embeddings are frequently used in Retrieval Augmented Generation (RAG) applications. But the semantic data captured in the embedding vectors has uses beyond RAG.
This is a simple example that creates embeddings for 96 headlines. (We chose 96 here for simplicity, since Cohere allows us to embed up to 96 documents in one call.) These embeddings are put through a dimensionality-reduction algorithm and then plotted, showing clustering based on semantic information.
# If you need to install these:
# !pip -q install umap-learn   # note: install umap-learn, not umap
# !pip -q install datasets
import random
import numpy as np
import matplotlib.pyplot as plt
import umap.umap_ as umap
from datasets import load_dataset
Create a small corpus of news headlines¶
This uses a dataset provided by Hugging Face datasets and selects a random sample with equal representation from three categories.
dataset = load_dataset('ag_news', split='train') # 120k rows
label_names = {0: 'World', 1: 'Sports', 2: 'Business', 3: 'Sci/Tech'}
target_labels = [0, 1, 3] # World, Sports, Sci/Tech
sample_size = 32
rows = []
for lbl in target_labels:
subset = [r for r in dataset if r['label'] == lbl]
rows.extend(random.sample(subset, sample_size))
texts = [r['text'] for r in rows]
labels = [label_names[r['label']] for r in rows]
print(f'Loaded {len(texts)} headlines: {[f"{label}={labels.count(label)}" for label in set(labels)]}\n')
print('Example text:', texts[0])
Loaded 96 headlines: ['World=32', 'Sci/Tech=32', 'Sports=32']

Example text: Budget fight deals setback to Sharon The government of Prime Minister Ariel Sharon was thrown into crisis last night after Israel #39;s parliament rejected the proposed 2005 budget and Sharon
Use the API to get embeddings for each headline¶
Because we limited the number of headlines to 96, we can do this in one request to the API. If we had a larger set, we would need to split it into batches of 96 (see the sketch after this example). If the documents were larger than headlines, we would also need to split each document into chunks smaller than the Cohere token limit.
request = {'model': 'cohere_english_v3', 'encodingFormat': 'float', 'input': texts, 'input_type': 'clustering'}
resp = requests.post(endpoint, json=request, headers=headers)
result = resp.json()
raw_embeddings = [data['embedding'] for data in result['data']]
embeddings = np.array(raw_embeddings)
embeddings.shape
(96, 1024)
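For a corpus larger than 96 documents, the same request can be wrapped in a simple batching loop. A sketch (embed_in_batches is our own helper name; it assumes the request format used above):
def embed_in_batches(texts, batch_size=96):
    # Cohere accepts at most 96 documents per embeddings request
    embeddings = []
    for i in range(0, len(texts), batch_size):
        request = {
            'model': 'cohere_english_v3',
            'encodingFormat': 'float',
            'input': texts[i:i + batch_size],
            'input_type': 'clustering',
        }
        resp = requests.post(endpoint, json=request, headers=headers)
        embeddings.extend(d['embedding'] for d in resp.json()['data'])
    return embeddings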
Reduce dimensions¶
Cohere embeddings have 1024 dimensions. This allows the embedding to capture semantic meaning on many different levels, but it also makes the embeddings challenging to visualize. UMAP (from the umap-learn package) is a non-linear dimensionality-reduction algorithm that tries to keep points that are close together in the high-dimensional space close together in the low-dimensional map. This is useful for visualization.
From the plot below, we can see that these headlines cluster by category based solely on the text of the headlines.
reducer = umap.UMAP(random_state=42, n_jobs=1, min_dist=0.2)
emb_2d = reducer.fit_transform(embeddings)
plt.figure(figsize=(9, 7))
unique_labels = sorted(set(labels))
colors = {lbl: idx for idx, lbl in enumerate(unique_labels)}
for lbl in unique_labels:
idxs = [i for i, label in enumerate(labels) if label == lbl]
plt.scatter(emb_2d[idxs, 0], emb_2d[idxs, 1], label=lbl, s=45)
plt.title('UMAP projection of 96 AG-News headlines')
plt.legend()
plt.axis('off')
plt.show()