Robotics object detection demo.
#include <agents-cpp/agent_context.h>
#include <agents-cpp/config_loader.h>
#include <agents-cpp/llm_interface.h>
#include <agents-cpp/logger.h>
#include <agents-cpp/tools/tool_registry.h>
String model =
"gemini-2.5-flash";
String api_key = config.get(
"GEMINI_API_KEY");
if (api_key.empty()) {
co_return EXIT_FAILURE;
}
auto context = std::make_shared<AgentContext>();
auto llm =
createLLM(provider, api_key, model);
llm->setOptions(options);
context->setLLM(llm);
context->setSystemPrompt(
"You are a robotics vision system specialized in object detection and spatial reasoning. "
"You can identify objects in images and provide precise 2D coordinates for robotic manipulation. "
"Always respond with valid JSON format as requested."
);
try {
Logger::info(
"Detecting objects and providing 2D coordinates...");
Point to no more than 10 items in the image. The label returned
should be an identifying name for the object detected.
The answer should follow the json format: [{"point": [y, x], "label": "<label1>"}, ...].
The points are in [y, x] format normalized to 0-1000.
)";
auto object_detection_resp = co_await context->chatMultiModal(
robotics_prompt,
{ "file://" + media_path + "/scenes/synthetic_table.png" }
);
Logger::info(
"Finding specific objects based on natural language commands...");
Find the banana in the image. Return the coordinates and label in JSON format:
[{"point": [y, x], "label": "banana"}]
)";
auto find_resp = co_await context->chatMultiModal(
find_prompt,
{ "file://" + media_path + "/scenes/synthetic_table.png" }
);
Logger::info(
"\n=== Object Detection & Bounding Boxes Demo ===");
Logger::info(
"Analyzing scene for objects and their bounding boxes...");
String bounding_boxes_prompt = R
"(
Analyze objects in this scene and return bounding boxes as a JSON array with labels.
Never return masks or code fencing. Limit to 25 objects. Include as many objects as you
can identify on the table.
If an object is present multiple times, name them according to their
unique characteristic (colors, size, position, unique characteristics, etc..).
The format should be as follows: [{"box_2d": [ymin, xmin, ymax, xmax],
"label": <label for the object>}] normalized to 0-1000. The values in
box_2d must only be integers
)";
auto bounding_boxes_resp = co_await context->chatMultiModal(
bounding_boxes_prompt,
{ "file://" + media_path + "/scenes/synthetic_table.png" }
);
Logger::info(
"Analyzing scene for robotic manipulation...");
Analyze this scene for robotic manipulation. Identify:
1. A few graspable objects with their coordinates
2. Potential obstacles or hazards
3. Objects that can be picked up together
4. Objects that require careful handling
Return results in JSON format with categories and coordinates.
)";
auto scene_resp = co_await context->chatMultiModal(
scene_prompt,
{ "file://" + media_path + "/scenes/synthetic_table.png" }
);
} catch (const std::exception& e) {
co_return EXIT_FAILURE;
}
co_return EXIT_SUCCESS;
}
int main(int argc, char**argv) {
if (argc != 2)
{
Logger::error(
"Usage: ./robotics_object_detection_demo <absolute_path_to_media_dir>");
return EXIT_FAILURE;
}
}
static ConfigLoader & getInstance()
Get the singleton instance of ConfigLoader.
@ INFO
Info logging level.
Definition logger.h:40
static void error(fmt::format_string< Args... > fmt, Args &&... args)
Log a message at error level.
Definition logger.h:124
static void init(Level level=Level::INFO)
Initialize the logger.
static void info(fmt::format_string< Args... > fmt, Args &&... args)
Log a message at info level.
Definition logger.h:104
Standard C++20 coroutine-based Task implementation (no external deps)
Definition coroutine_utils.h:38
Framework Namespace.
Definition agent.h:18
std::shared_ptr< LLMInterface > createLLM(const String &provider, const String &api_key, const String &model="")
Factory function to create a specific LLM provider.
std::string String
String type.
Definition types.h:27
T blockingWait(Task< T > &&task)
Helper to run a coroutine and get the result synchronously.
Definition coroutine_utils.h:496
Options for LLM API calls.
Definition llm_interface.h:25
double temperature
The temperature of the LLM.
Definition llm_interface.h:29
int max_tokens
The maximum number of tokens.
Definition llm_interface.h:33