Agents 0.0.2
Edge AI Agents SDK
Loading...
Searching...
No Matches
multimodal_example.cpp

Multimodal understanding example with image, audio, and video.

Multimodal understanding example with image, audio, and video.

Version
0.1
Date
2025-09-21
#include <agents-cpp/config_loader.h>
#include <agents-cpp/logger.h>
#include <agents-cpp/tools/tool_registry.h>
#include <iostream>
using namespace agents;
/**
 * @brief Run the multimodal example: stream audio, video, and image chats to stdout.
 *
 * Builds an AgentContext backed by the "google" provider with the
 * "gemini-2.5-flash" model, registers the media loader tool, and issues three
 * streamChatMultiModal requests whose responses are printed chunk by chunk.
 *
 * @param media_path Directory holding the sample media. The example reads
 *        audio/sample.mp3, video/sample_video.mp4 and scenes/robotics_scene.png
 *        beneath it. Taken by value so the coroutine frame owns its own copy.
 * @return EXIT_FAILURE when GEMINI_API_KEY is not configured or when a request
 *         throws std::exception; EXIT_SUCCESS otherwise.
 */
Task<int> runmultimodalExample(String media_path) {
    // Initialize logger (the default level of Logger::init is Level::INFO)
    Logger::init();

    auto& config = ConfigLoader::getInstance();

    // Choose provider here
    String provider = "google";
    String model = "gemini-2.5-flash";
    String api_key = config.get("GEMINI_API_KEY");
    if (api_key.empty()) {
        std::cerr << "GEMINI_API_KEY not set.\n";
        co_return EXIT_FAILURE;
    }

    // Create the context
    auto context = std::make_shared<AgentContext>();

    // Create the LLM
    auto llm = createLLM(provider, api_key, model);

    // Configure LLM options
    LLMOptions options;
    options.temperature = 0.7;
    llm->setOptions(options);

    // Set the LLM
    context->setLLM(llm);

    // Register tools
    context->registerTool(tools::createMediaLoaderTool(llm));

    // Set the system prompt
    context->setSystemPrompt(
        "You are a friendly assistant that helps users find information and answer questions. "
        "Use the tools available to you to load files, gather information, and provide comprehensive answers. "
    );

    // Lambda for printing a result stream to the console.
    // Deliberately capture-less: a coroutine lambda's captures live in the closure
    // object, not in the coroutine frame, so capturing nothing removes any
    // dependency on the closure's lifetime across suspension points.
    auto printStream = [](AsyncGenerator<String>& generator) -> Task<void> {
        // Display the result as it arrives; next() yields an empty optional at end of stream
        while (auto item = co_await generator.next()) {
            std::cout << *item << std::flush;
        }
        std::cout << std::endl << std::endl;
    };

    // Examples demonstrating chat with media URIs; prefer https:// or file:// for large images.
    // Alternatively, provide the base64 encoded media directly using a data URI:
    //   data:<mime_type>;base64,<data>
    // For example: "data:image/png;base64,<data>"

    // Tracks the outcome so that a failed request is reported through the exit code
    // instead of being logged and then reported as success.
    int exit_code = EXIT_SUCCESS;
    try {
        // Example 1: Multimodal Chat With Audio (unified API)
        auto audio_resp = context->streamChatMultiModal(
            "What is in this audio ?",
            { "file://" + media_path + "/audio/sample.mp3" }
        );
        co_await printStream(audio_resp);

        // Example 2: Multimodal Chat With Video (unified API)
        auto video_resp = context->streamChatMultiModal(
            "What is this video about ?",
            { "file://" + media_path + "/video/sample_video.mp4" }
        );
        co_await printStream(video_resp);

        // Example 3: Multimodal Chat With Multiple Images (unified API)
        // Only real URIs are listed; an empty string does not identify any media.
        auto image_resp = context->streamChatMultiModal(
            "What is happening in these images?",
            {
                "https://i.ytimg.com/vi/Eb4ICVPOUlI/hqdefault.jpg",
                "file://" + media_path + "/scenes/robotics_scene.png"
            }
        );
        co_await printStream(image_resp);
    } catch (const std::exception& e) {
        Logger::error("Error: {}", e.what());
        // Set a flag rather than returning here, keeping the handler free of coroutine operations
        exit_code = EXIT_FAILURE;
    }
    co_return exit_code;
}
/**
 * @brief Program entry point for the multimodal example.
 *
 * Expects exactly one command-line argument, the media directory, and runs
 * runmultimodalExample on it to completion via blockingWait.
 *
 * @param argc Argument count; must be 2 (program name plus media directory).
 * @param argv Argument vector; argv[1] is forwarded as the media directory.
 * @return The example's result, or EXIT_FAILURE when the argument count is wrong.
 */
int main(int argc, char** argv) {
    // Happy path first: with the single expected argument, drive the coroutine synchronously
    if (argc == 2) {
        return blockingWait(runmultimodalExample(String(argv[1])));
    }

    // Any other argument count is a usage error
    Logger::error("Usage: ./multimodal_example <absolute_path_to_media_dir>");
    return EXIT_FAILURE;
}
AsyncGenerator with a Folly-compatible API: next() returns Task<optional<T>>
Definition coroutine_utils.h:379
static ConfigLoader & getInstance()
Get the singleton instance of ConfigLoader.
INFO
Info logging level.
Definition logger.h:40
static void error(fmt::format_string< Args... > fmt, Args &&... args)
Log a message at error level.
Definition logger.h:124
static void init(Level level=Level::INFO)
Initialize the logger.
Standard C++20 coroutine-based Task implementation (no external deps)
Definition coroutine_utils.h:38
std::shared_ptr< Tool > createMediaLoaderTool(std::shared_ptr< LLMInterface > llm)
Creates a tool for loading media from URLs or local files.
Framework Namespace.
Definition agent.h:18
std::shared_ptr< LLMInterface > createLLM(const String &provider, const String &api_key, const String &model="")
Factory function to create a specific LLM provider.
std::string String
String type.
Definition types.h:27
T blockingWait(Task< T > &&task)
Helper to run a coroutine and get the result synchronously.
Definition coroutine_utils.h:496
Options for LLM API calls.
Definition llm_interface.h:25
double temperature
The temperature of the LLM.
Definition llm_interface.h:29