Visual Question Answering (VQA) enables you to ask questions about images and receive intelligent, context-aware responses. Using Gemini’s multimodal capabilities, you can:
# Vertex AI / Gemini multimodal setup shared by every snippet below.
from google import genai
from google.genai.types import Part, GenerateContentConfig
from IPython.display import Image as IPImage, display, Markdown

# Initialize client — replace with your own project/region.
PROJECT_ID = "your-project-id"
LOCATION = "us-central1"
client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)

# Use a Gemini model with vision capabilities.
MODEL_ID = "gemini-2.5-flash"
import html

# Generate concise alt text for web accessibility and emit it as an <img> tag.
# NOTE(review): this snippet uses `image_data` (raw JPEG bytes) which is only
# loaded in a later cell — run the image-loading cell first.
response = client.models.generate_content(
    model=MODEL_ID,
    contents=[
        Part.from_bytes(data=image_data, mime_type="image/jpeg"),
        "Generate concise alt text for this image for web accessibility.",
    ],
)
alt_text = response.text
# Fix: escape the model output before interpolating it into an HTML attribute —
# a quote or angle bracket in the generated text would otherwise break (or
# inject markup into) the tag.
print(f'<img src="image.jpg" alt="{html.escape(alt_text, quote=True)}">')
# Load a local photo and ask the model to count the people in it.
with open("street_scene.jpg", "rb") as image_file:
    image_data = image_file.read()

counting_question = "How many people are in this image?"
response = client.models.generate_content(
    model=MODEL_ID,
    contents=[
        Part.from_bytes(data=image_data, mime_type="image/jpeg"),
        counting_question,
    ],
)
print(response.text)
# Ask a series of independent questions about the same image, printing each
# question/answer pair as we go.
questions = [
    "What is the main subject of this image?",
    "What time of day does it appear to be?",
    "What emotions or mood does this image convey?",
]

# The image part is identical for every request, so build it once.
image_part = Part.from_bytes(data=image_data, mime_type="image/jpeg")
for question in questions:
    reply = client.models.generate_content(
        model=MODEL_ID,
        contents=[image_part, question],
    )
    print(f"Q: {question}")
    print(f"A: {reply.text}\n")
# Object identification: ask for an inventory of everything visible.
object_prompt = "List all the distinct objects you can identify in this image."
response = client.models.generate_content(
    model=MODEL_ID,
    contents=[
        Part.from_bytes(data=image_data, mime_type="image/jpeg"),
        object_prompt,
    ],
)
print(response.text)
# Counting task: how many cars appear in a parking-lot photo.
car_question = "Count the number of cars visible in this parking lot image."
response = client.models.generate_content(
    model=MODEL_ID,
    contents=[
        Part.from_bytes(data=image_data, mime_type="image/jpeg"),
        car_question,
    ],
)
print(response.text)
# Spatial reasoning: where objects sit relative to one another.
spatial_prompt = (
    "Describe the spatial relationship between the objects in this image. "
    "What is in front, behind, left, and right?"
)
response = client.models.generate_content(
    model=MODEL_ID,
    contents=[
        Part.from_bytes(data=image_data, mime_type="image/jpeg"),
        spatial_prompt,
    ],
)
print(response.text)
# OCR: extract all visible text from a document photo.
with open("document.jpg", "rb") as doc_file:
    image_data = doc_file.read()

response = client.models.generate_content(
    model=MODEL_ID,
    contents=[
        Part.from_bytes(data=image_data, mime_type="image/jpeg"),
        "Extract all the text visible in this image.",
    ],
)
print(response.text)
# Structured OCR: pull business-card fields into a fixed template.
card_prompt = """Extract the text from this business card and format it as:
    Name:
    Title:
    Company:
    Email:
    Phone:"""
response = client.models.generate_content(
    model=MODEL_ID,
    contents=[
        Part.from_bytes(data=image_data, mime_type="image/jpeg"),
        card_prompt,
    ],
)
print(response.text)
# Multilingual OCR: extract text and detect which languages are present.
language_prompt = "Extract all text from this image and identify the language(s) used."
response = client.models.generate_content(
    model=MODEL_ID,
    contents=[
        Part.from_bytes(data=image_data, mime_type="image/jpeg"),
        language_prompt,
    ],
)
print(response.text)
# Multi-part scene analysis in a single structured prompt.
scene_prompt = """Analyze this scene:
    1. What type of location is this?
    2. What activities are taking place?
    3. What is the overall atmosphere?
    4. What time period does this appear to be from?"""
response = client.models.generate_content(
    model=MODEL_ID,
    contents=[
        Part.from_bytes(data=image_data, mime_type="image/jpeg"),
        scene_prompt,
    ],
)
print(response.text)
# Activity recognition: what the people in the image are doing.
activity_question = "What activities or actions are the people in this image performing?"
response = client.models.generate_content(
    model=MODEL_ID,
    contents=[
        Part.from_bytes(data=image_data, mime_type="image/jpeg"),
        activity_question,
    ],
)
print(response.text)
# Multi-image input: send two images in one request and ask for a comparison.
def _read_bytes(path):
    # Small helper: return the raw bytes of a local file.
    with open(path, "rb") as fh:
        return fh.read()

image1_data = _read_bytes("image1.jpg")
image2_data = _read_bytes("image2.jpg")

response = client.models.generate_content(
    model=MODEL_ID,
    contents=[
        Part.from_bytes(data=image1_data, mime_type="image/jpeg"),
        Part.from_bytes(data=image2_data, mime_type="image/jpeg"),
        "Compare these two images. What are the similarities and differences?",
    ],
)
print(response.text)
# Diagram understanding: walk through the steps of an assembly diagram.
diagram_prompt = "This is a furniture assembly diagram. Explain the steps shown in sequence."
response = client.models.generate_content(
    model=MODEL_ID,
    contents=[
        Part.from_bytes(data=image_data, mime_type="image/jpeg"),
        diagram_prompt,
    ],
)
print(response.text)
# Chart/graph analysis.
# Fix: `chart_data` was never defined anywhere in this tutorial, so the
# original snippet raised NameError — load the chart image from disk first.
with open("chart.png", "rb") as chart_file:
    chart_data = chart_file.read()

response = client.models.generate_content(
    model=MODEL_ID,
    contents=[
        Part.from_bytes(data=chart_data, mime_type="image/png"),
        """Analyze this chart:
    1. What type of chart is this?
    2. What are the key trends?
    3. What insights can you extract?
    4. Are there any notable anomalies?""",
    ],
)
print(response.text)
# Batch processing: run the same question over several Cloud Storage images.
# Part.from_uri lets the API fetch the image directly — no local download.
image_uris = [
    "gs://your-bucket/images/img1.jpg",
    "gs://your-bucket/images/img2.jpg",
    "gs://your-bucket/images/img3.jpg",
]

for image_uri in image_uris:
    reply = client.models.generate_content(
        model=MODEL_ID,
        contents=[
            Part.from_uri(file_uri=image_uri, mime_type="image/jpeg"),
            "What is the main subject of this image?",
        ],
    )
    print(f"{image_uri}: {reply.text}\n")
For specialized captioning tasks, you can fine-tune Gemini models:
# Supervised fine-tuning (SFT) of Gemini for domain-specific captioning.
from vertexai.preview.tuning import sft

# Training data is JSONL; each line pairs a user turn (image + prompt) with a
# model turn (target caption):
# {"contents": [{"role": "user", "parts": [{"fileData": {...}}, {"text": "..."}]},
#               {"role": "model", "parts": [{"text": "..."}]}]}
sft_tuning_job = sft.train(
    source_model="gemini-2.5-flash",
    train_dataset="gs://your-bucket/training_data.jsonl",
    validation_dataset="gs://your-bucket/validation_data.jsonl",
    epochs=4,
    learning_rate_multiplier=1.0,
)

# Block until the tuning job finishes.
sft_tuning_job.wait()

# Handle to the tuned model for subsequent inference calls.
tuned_model = sft_tuning_job.tuned_model
Fine-tuning is ideal when you need domain-specific captions (e.g., medical images, fashion products, or technical diagrams).
Here’s a comprehensive example combining multiple VQA capabilities:
# End-to-end example: one request that combines several VQA capabilities.
from google import genai
from google.genai.types import Part
import json  # NOTE(review): unused in this snippet — confirm before removing.

# Initialize the Vertex AI client.
PROJECT_ID = "your-project-id"
client = genai.Client(vertexai=True, project=PROJECT_ID, location="us-central1")
MODEL_ID = "gemini-2.5-flash"

# Load the image to analyze.
with open("scene.jpg", "rb") as scene_file:
    image_data = scene_file.read()

# One structured prompt covering description, objects, people, text, color,
# mood, and setting.
analysis_prompt = """Analyze this image comprehensively:
1. Scene Description: Provide a detailed description of the scene.
2. Objects: List all identifiable objects.
3. People: How many people are present? What are they doing?
4. Text: Extract any visible text.
5. Colors: What are the dominant colors?
6. Mood: Describe the overall mood or atmosphere.
7. Time/Setting: When and where does this appear to be?
Provide your analysis in a clear, structured format."""

response = client.models.generate_content(
    model=MODEL_ID,
    contents=[
        Part.from_bytes(data=image_data, mime_type="image/jpeg"),
        analysis_prompt,
    ],
)

print("=== Comprehensive Image Analysis ===")
print(response.text)