diff --git a/common/arg.cpp b/common/arg.cpp index 1b6884781d..0213a67c80 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2221,8 +2221,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD")); add_opt(common_arg( - {"--image", "--audio"}, "FILE", - "path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n", + {"--image", "--audio", "--video"}, "FILE", + "path to an image, audio, or video file. use with multimodal models, use comma-separated values for multiple files\n", [](common_params & params, const std::string & value) { for (const auto & item : parse_csv_row(value)) { params.image.emplace_back(item); diff --git a/common/common.h b/common/common.h index 13f387271d..b732a2087d 100644 --- a/common/common.h +++ b/common/common.h @@ -571,7 +571,7 @@ struct common_params { struct common_params_model mmproj; bool mmproj_use_gpu = true; // use GPU for multimodal model bool no_mmproj = false; // explicitly disable multimodal model - std::vector image; // path to image file(s) + std::vector image; // path to image file(s) ; TODO: change the name to "media" int image_min_tokens = -1; int image_max_tokens = -1; diff --git a/tests/test-mtmd-c-api.c b/tests/test-mtmd-c-api.c index b49498c87c..46a038f4a5 100644 --- a/tests/test-mtmd-c-api.c +++ b/tests/test-mtmd-c-api.c @@ -2,6 +2,7 @@ #include #include "mtmd.h" +#include "mtmd-helper.h" int main(void) { printf("\n\nTesting libmtmd C API...\n"); @@ -17,6 +18,11 @@ int main(void) { return 1; } + // simple test for the helper + size_t n_tokens_total = mtmd_helper_get_n_tokens(chunks); + printf("Total tokens in chunks: %zu\n", n_tokens_total); + assert(n_tokens_total > 0); + size_t n_chunks = mtmd_input_chunks_size(chunks); printf("Number of chunks: %zu\n", n_chunks); assert(n_chunks > 0); diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index a164b3bb91..3ed345bf0f 100644 
--- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -235,7 +235,7 @@ struct cli_context { }; // TODO?: Make this reusable, enums, docs -static const std::array cmds = { +static const std::array cmds = { "/audio ", "/clear", "/exit", @@ -243,6 +243,7 @@ static const std::array cmds = { "/image ", "/read ", "/regen", + "/video ", }; static std::vector> auto_completion_callback(std::string_view line, size_t cursor_byte_pos) { @@ -457,6 +458,9 @@ int llama_cli(int argc, char ** argv) { if (inf.has_inp_audio) { console::log(" /audio add an audio file\n"); } + if (inf.has_inp_video) { + console::log(" /video add a video file\n"); + } console::log("\n"); // interactive loop @@ -553,7 +557,8 @@ int llama_cli(int argc, char ** argv) { continue; } else if ( (string_starts_with(buffer, "/image ") && inf.has_inp_image) || - (string_starts_with(buffer, "/audio ") && inf.has_inp_audio)) { + (string_starts_with(buffer, "/audio ") && inf.has_inp_audio) || + (string_starts_with(buffer, "/video ") && inf.has_inp_video)) { // just in case (bad copy-paste for example), we strip all trailing/leading spaces std::string fname = string_strip(buffer.substr(7)); std::string marker = ctx_cli.load_input_file(fname, true); diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 20c5317863..09b62357f3 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -1,5 +1,8 @@ # mtmd +set(MTMD_VIDEO ON CACHE BOOL "enable video support in mtmd (requires ffmpeg binary in PATH)") +# TODO: add MTMD_VIDEO_METHOD in the future to select between ffmpeg and other backends + find_package(Threads REQUIRED) add_library(mtmd @@ -63,6 +66,10 @@ target_include_directories(mtmd PRIVATE ../..) 
target_include_directories(mtmd PRIVATE ../../vendor) target_compile_features (mtmd PRIVATE cxx_std_17) +if (MTMD_VIDEO) + target_compile_definitions(mtmd PRIVATE MTMD_VIDEO) +endif() + if (BUILD_SHARED_LIBS) set_target_properties (mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON) target_compile_definitions(mtmd PRIVATE LLAMA_BUILD) diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp index bd7f9871c3..a3cad7cd06 100644 --- a/tools/mtmd/mtmd-cli.cpp +++ b/tools/mtmd/mtmd-cli.cpp @@ -77,6 +77,7 @@ struct mtmd_cli_context { int n_batch; mtmd::bitmaps bitmaps; + std::vector videos; // chat template common_chat_templates_ptr tmpls; @@ -166,11 +167,14 @@ struct mtmd_cli_context { } bool load_media(const std::string & fname) { - mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str(), false)); - if (!bmp.ptr) { + auto res = mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str(), false); + if (!res.bitmap) { return false; } - bitmaps.entries.push_back(std::move(bmp)); + bitmaps.entries.emplace_back(res.bitmap); + if (res.video_ctx) { + videos.emplace_back(res.video_ctx); + } return true; } }; @@ -253,6 +257,7 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) { } ctx.bitmaps.entries.clear(); + ctx.videos.clear(); llama_pos new_n_past; if (mtmd_helper_eval_chunks(ctx.ctx_vision.get(), @@ -373,6 +378,9 @@ int main(int argc, char ** argv) { if (mtmd_support_audio(ctx.ctx_vision.get())) { LOG("\n /audio load an audio"); } + if (mtmd_helper_support_video(ctx.ctx_vision.get())) { + LOG("\n /video load a video"); + } LOG("\n /clear clear the chat history"); LOG("\n /quit or /exit exit the program"); LOG("\n"); @@ -407,14 +415,15 @@ int main(int argc, char ** argv) { g_is_generating = true; bool is_image = line == "/image" || line.find("/image ") == 0; bool is_audio = line == "/audio" || line.find("/audio ") == 0; - if (is_image || is_audio) { + bool is_video = line == "/video" || line.find("/video ") == 0; + if 
(is_image || is_audio || is_video) { if (line.size() < 8) { LOG_ERR("ERR: Missing media filename\n"); continue; } std::string media_path = line.substr(7); if (ctx.load_media(media_path)) { - LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : "audio"); + LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : is_audio ? "audio" : "video"); content += mtmd_default_marker(); } // else, error is already printed by libmtmd diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp index 94ad01511e..18440f06ef 100644 --- a/tools/mtmd/mtmd-helper.cpp +++ b/tools/mtmd/mtmd-helper.cpp @@ -36,6 +36,11 @@ #error "mtmd-helper is a public library outside of mtmd. it must not include internal headers" #endif +#ifdef MTMD_VIDEO +#include "sheredom/subprocess.h" +#include +#endif + // // internal logging functions // @@ -79,6 +84,7 @@ struct mtmd_helper_logger { } } g_logger; +#define LOG_DBG(...) g_logger.log(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) #define LOG_INF(...) g_logger.log(GGML_LOG_LEVEL_INFO, __VA_ARGS__) #define LOG_WRN(...) g_logger.log(GGML_LOG_LEVEL_WARN, __VA_ARGS__) #define LOG_ERR(...) 
g_logger.log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) @@ -478,42 +484,94 @@ static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int } // namespace audio_helpers -mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder) { +// Computes FNV-1a hash of the data +static std::string fnv_hash(const uint8_t * data, size_t len) { + const uint64_t fnv_prime = 0x100000001b3ULL; + uint64_t hash = 0xcbf29ce484222325ULL; + + for (size_t i = 0; i < len; ++i) { + hash ^= data[i]; + hash *= fnv_prime; + } + return std::to_string(hash); +} + +mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder) { + // calculate the hash if needed + std::string id; + mtmd_bitmap * result = nullptr; + + if (!placeholder) { + id = fnv_hash(buf, len); + } + if (audio_helpers::is_audio_file((const char *)buf, len)) { std::vector pcmf32; const int sample_rate = mtmd_get_audio_sample_rate(ctx); if (sample_rate < 0) { LOG_ERR("This model does not support audio input\n"); - return nullptr; + return {nullptr, nullptr}; } if (!audio_helpers::decode_audio_from_buf(buf, len, sample_rate, pcmf32)) { LOG_ERR("Unable to read WAV audio file from buffer\n"); - return nullptr; + return {nullptr, nullptr}; } - return mtmd_bitmap_init_from_audio(pcmf32.size(), placeholder ? nullptr : pcmf32.data()); + result = mtmd_bitmap_init_from_audio(pcmf32.size(), placeholder ? nullptr : pcmf32.data()); + mtmd_bitmap_set_id(result, id.empty() ? nullptr : id.c_str()); + return {result, nullptr}; } // otherwise, we assume it's an image - mtmd_bitmap * result = nullptr; - { + if (!result) { int nx, ny, nc; auto * data = stbi_load_from_memory(buf, len, &nx, &ny, &nc, 3); - if (!data) { - LOG_ERR("%s: failed to decode image bytes\n", __func__); - return nullptr; + if (data) { + result = mtmd_bitmap_init(nx, ny, placeholder ? 
nullptr : data); + mtmd_bitmap_set_id(result, id.empty() ? nullptr : id.c_str()); + stbi_image_free(data); + return {result, nullptr}; } - result = mtmd_bitmap_init(nx, ny, placeholder ? nullptr : data); - stbi_image_free(data); + // otherwise, fallthrough to video decoding (if supported) } - return result; + + // last try: load as video +#ifdef MTMD_VIDEO + if (!result) { + auto params = mtmd_helper_video_init_params_default(); + auto video_ctx = mtmd_helper_video_init_from_buf(ctx, buf, len, params); + if (!video_ctx) { + LOG_ERR("%s: failed to decode buffer as either image/audio/video\n", __func__); + return {nullptr, nullptr}; + } + result = mtmd_bitmap_init_lazy(ctx, + id.empty() ? nullptr : id.c_str(), + video_ctx, + [](size_t, void * user_data, mtmd_bitmap ** out_bitmap, char ** out_text) -> int { + auto * vctx = static_cast(user_data); + char * text = nullptr; + int ret = mtmd_helper_video_read_next(vctx, out_bitmap, &text); + *out_text = text; // heap-allocated by read_next; freed automatically by mtmd + return ret; + }); + return {result, video_ctx}; + } +#else + if (!result) { + LOG_ERR("%s: failed to decode buffer as either image or audio (video support not compiled in)\n", __func__); + return {nullptr, nullptr}; + } +#endif + + // should not reach here + return {nullptr, nullptr}; } -mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder) { +mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder) { std::vector buf; FILE * f = fopen(fname, "rb"); if (!f) { LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno)); - return nullptr; + return {nullptr, nullptr}; } fseek(f, 0, SEEK_END); @@ -522,7 +580,7 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * if (file_size < 0) { LOG_ERR("Failed to get file size of %s\n", fname); fclose(f); - return nullptr; + return {nullptr, nullptr}; } buf.resize(file_size); 
@@ -530,9 +588,425 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fclose(f); if (n_read != (size_t)file_size) { LOG_ERR("Failed to read entire file %s", fname); - return nullptr; + return {nullptr, nullptr}; } return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size(), placeholder); } +bool mtmd_helper_support_video(mtmd_context * ctx) { +#ifdef MTMD_VIDEO + return mtmd_support_vision(ctx); +#else + return false; +#endif +} + +// +// Video input helpers +// + +#ifdef MTMD_VIDEO + +struct mtmd_helper_video { + mtmd_context * mctx; + std::string path; + std::vector input_buf; // non-empty when initialized from buffer + std::string ffmpeg_bin; + std::string ffprobe_bin; + float fps_target = 0.0f; + mtmd_helper_video_info info = {}; + + struct subprocess_s proc = {}; + bool proc_alive = false; + int32_t current_frame = 0; + std::thread feeder_thread; + + std::string prompt_start = "Video:"; + int32_t timestamp_interval_ms = 5000; // emit a timestamp text every N ms (0 = disabled) + float next_timestamp_ms = 0.0f; // next elapsed-ms threshold at which to emit + + std::vector frame_buf; + std::string pending_text; // text queued to be returned before the next frame + bool start_emitted = false; + + bool is_buf_input() const { return !input_buf.empty(); } + + // must run in a separate thread alongside stdout reading to avoid pipe deadlock + void feed_stdin(struct subprocess_s * sp) { + FILE * f = subprocess_stdin(sp); + if (!f) { + LOG_DBG("%s: subprocess has no stdin pipe\n", __func__); + return; + } + LOG_DBG("%s: feeding %zu bytes to stdin\n", __func__, input_buf.size()); + size_t written = fwrite(input_buf.data(), 1, input_buf.size(), f); + LOG_DBG("%s: wrote %zu bytes, closing stdin\n", __func__, written); + fclose(f); + } + + bool probe(float fps_target_arg) { + const char * input_arg = is_buf_input() ? 
"pipe:0" : path.c_str(); + const char * cmd[] = { + ffprobe_bin.c_str(), + "-v", "quiet", + "-show_entries", "stream=width,height,r_frame_rate,nb_frames,duration", + "-select_streams", "v:0", + "-of", "default=noprint_wrappers=1", + input_arg, + nullptr, + }; + + LOG_DBG("%s: launching:", __func__); + for (size_t i = 0; cmd[i]; i++) { LOG_DBG(" %s", cmd[i]); } + LOG_DBG("\n"); + + struct subprocess_s fprobe; + if (subprocess_create(cmd, + subprocess_option_search_user_path | subprocess_option_inherit_environment, + &fprobe) != 0) { + LOG_ERR("%s: failed to launch ffprobe\n", __func__); + return false; + } + + std::thread probe_feeder; + if (is_buf_input()) { + probe_feeder = std::thread([this, &fprobe]() { feed_stdin(&fprobe); }); + } + + uint32_t width = 0; + uint32_t height = 0; + float orig_fps = 0.0f; + float duration = -1.0f; + int32_t n_frames_orig = -1; + char line[256]; + FILE * fp = subprocess_stdout(&fprobe); + + while (fgets(line, sizeof(line), fp)) { + char * eq = strchr(line, '='); + if (!eq) continue; + *eq = '\0'; + const char * key = line; + const char * val = eq + 1; + char * nl = (char *)strchr(val, '\n'); + if (nl) *nl = '\0'; + + if (strcmp(key, "width") == 0) { + width = (uint32_t)atoi(val); + } else if (strcmp(key, "height") == 0) { + height = (uint32_t)atoi(val); + } else if (strcmp(key, "r_frame_rate") == 0) { + orig_fps = parse_rational(val); + } else if (strcmp(key, "nb_frames") == 0 && strcmp(val, "N/A") != 0) { + n_frames_orig = atoi(val); + } else if (strcmp(key, "duration") == 0 && strcmp(val, "N/A") != 0) { + duration = (float)atof(val); + } + } + + if (probe_feeder.joinable()) { + probe_feeder.join(); + } + + int ret_code; + subprocess_join(&fprobe, &ret_code); + subprocess_destroy(&fprobe); + + if (width == 0 || height == 0 || orig_fps <= 0.0f) { + return false; + } + + if (duration < 0.0f && n_frames_orig > 0) { + duration = (float)n_frames_orig / orig_fps; + } + + fps_target = fps_target_arg > 0.0f ? 
fps_target_arg : orig_fps; + info.width = width; + info.height = height; + info.fps = fps_target; + LOG_DBG("%s: %ux%u fps=%.2f duration=%.2fs n_frames=%d\n", + __func__, width, height, fps_target, duration, info.n_frames); + info.n_frames = duration > 0.0f ? (int32_t)(duration * fps_target + 0.5f) : -1; + frame_buf.resize((size_t)width * height * 3); + return true; + } + + bool start_ffmpeg(float seek_seconds) { + char seek_buf[64]; + char fps_buf[64]; + + std::vector cmd; + cmd.push_back(ffmpeg_bin.c_str()); + + if (!is_buf_input() && seek_seconds > 0.0f) { + // input-side seek: fast, keyframe-accurate; only valid for seekable file inputs + snprintf(seek_buf, sizeof(seek_buf), "%.6f", seek_seconds); + cmd.push_back("-ss"); + cmd.push_back(seek_buf); + } + + cmd.push_back("-i"); + // cache:pipe:0 wraps stdin with a seekable in-memory cache, letting ffmpeg seek + // backwards for container headers (e.g. MP4 moov atom at end of file) + cmd.push_back(is_buf_input() ? "cache:pipe:0" : path.c_str()); + + if (seek_seconds > 0.0f && is_buf_input()) { + // output-side seek: frame-accurate but decodes and discards frames up to seek point + snprintf(seek_buf, sizeof(seek_buf), "%.6f", seek_seconds); + cmd.push_back("-ss"); + cmd.push_back(seek_buf); + } + + if (fps_target > 0.0f) { + snprintf(fps_buf, sizeof(fps_buf), "fps=%.6f", fps_target); + cmd.push_back("-vf"); + cmd.push_back(fps_buf); + } + + cmd.push_back("-f"); + cmd.push_back("rawvideo"); + cmd.push_back("-pix_fmt"); + cmd.push_back("rgb24"); + cmd.push_back("pipe:1"); + cmd.push_back("-loglevel"); + cmd.push_back("error"); + cmd.push_back(nullptr); + + LOG_DBG("%s: launching:", __func__); + for (size_t i = 0; cmd[i]; i++) { + LOG_DBG(" %s", cmd[i]); + } + LOG_DBG("\n"); + + int ret = subprocess_create( + cmd.data(), + subprocess_option_search_user_path | subprocess_option_inherit_environment, + &proc); + + proc_alive = (ret == 0); + LOG_DBG("%s: subprocess_create ret=%d proc_alive=%d\n", __func__, ret, 
(int)proc_alive); + + if (proc_alive && is_buf_input()) { + LOG_DBG("%s: starting feeder thread for %zu-byte buffer\n", __func__, input_buf.size()); + feeder_thread = std::thread([this]() { feed_stdin(&proc); }); + } + + return proc_alive; + } + + void stop_ffmpeg() { + if (proc_alive) { + subprocess_terminate(&proc); + subprocess_destroy(&proc); + proc_alive = false; + } + if (feeder_thread.joinable()) { + feeder_thread.join(); + } + } + + mtmd_bitmap * read_next_frame() { + if (!proc_alive) return nullptr; + + FILE * fp = subprocess_stdout(&proc); + const size_t frame_size = (size_t)info.width * info.height * 3; + LOG_DBG("%s: reading frame %d, expecting %zu bytes (%ux%u)\n", + __func__, current_frame, frame_size, info.width, info.height); + + size_t total_read = 0; + while (total_read < frame_size) { + size_t n = fread(frame_buf.data() + total_read, 1, frame_size - total_read, fp); + if (n == 0) { + // clean EOF only if no bytes read yet; partial frame is an error + LOG_DBG("%s: fread returned 0 after %zu/%zu bytes (ferror=%d)\n", + __func__, total_read, frame_size, ferror(fp)); + proc_alive = false; + return nullptr; + } + total_read += n; + } + + LOG_DBG("%s: frame %d read OK\n", __func__, current_frame); + current_frame++; + return mtmd_bitmap_init(info.width, info.height, frame_buf.data()); + } + + int32_t read_next(mtmd_bitmap ** out_bitmap, char ** out_text) { + *out_bitmap = nullptr; + *out_text = nullptr; + + if (!pending_text.empty()) { + *out_text = strdup(pending_text.c_str()); + pending_text.clear(); + return *out_text ? 0 : -2; + } + + LOG_DBG("%s: proc_alive=%d start_emitted=%d current_frame=%d\n", + __func__, (int)proc_alive, (int)start_emitted, current_frame); + + if (!proc_alive) { + return (current_frame == 0) ? -2 : -1; + } + + if (!start_emitted) { + start_emitted = true; + if (!prompt_start.empty()) { + *out_text = strdup(prompt_start.c_str()); + return *out_text ? 
0 : -2; + } + } + + mtmd_bitmap * frame = read_next_frame(); + if (!frame) return -1; + *out_bitmap = frame; + + if (timestamp_interval_ms > 0) { + // current_frame was already incremented by read_next_frame(); undo for elapsed calc + float elapsed_ms = (float)(current_frame - 1) / info.fps * 1000.0f; + if (elapsed_ms >= next_timestamp_ms) { + char ts_buf[32]; + float elapsed_s = elapsed_ms / 1000.0f; + int minutes = (int)(elapsed_s / 60); + float seconds = elapsed_s - minutes * 60.0f; + snprintf(ts_buf, sizeof(ts_buf), "[%dm%.2fs]", minutes, seconds); + pending_text = ts_buf; + next_timestamp_ms += (float)timestamp_interval_ms; + } + } + + return 0; + } + + static float parse_rational(const char * s) { + int num = 0, den = 1; + if (sscanf(s, "%d/%d", &num, &den) == 2 && den > 0) { + return (float)num / (float)den; + } + float val; + if (sscanf(s, "%f", &val) == 1) { + return val; + } + return 0.0f; + } +}; +#endif + +mtmd_helper_video_init_params mtmd_helper_video_init_params_default() { + return { + /* fps_target */ 4.0f, + /* ffmpeg_bin_dir */ nullptr, + /* timestamp_interval_ms */ 5000, + }; +} + +static std::string video_resolve_bin(const char * bin_dir, const char * name) { + if (!bin_dir || bin_dir[0] == '\0') { + return name; // rely on PATH + } + std::string result = bin_dir; + char last = result.back(); + if (last != '/' && last != '\\') { +#ifdef _WIN32 + result += '\\'; +#else + result += '/'; +#endif + } + result += name; +#ifdef _WIN32 + result += ".exe"; +#endif + return result; +} + +mtmd_helper_video * mtmd_helper_video_init( + mtmd_context * mctx, + const char * path, + mtmd_helper_video_init_params params) { +#ifdef MTMD_VIDEO + auto * ctx = new mtmd_helper_video(); + + ctx->mctx = mctx; + ctx->path = path; + ctx->ffmpeg_bin = video_resolve_bin(params.ffmpeg_bin_dir, "ffmpeg"); + ctx->ffprobe_bin = video_resolve_bin(params.ffmpeg_bin_dir, "ffprobe"); + ctx->timestamp_interval_ms = params.timestamp_interval_ms; + + if 
(!ctx->probe(params.fps_target)) { + LOG_ERR("%s: ffprobe failed for '%s' (is ffprobe in PATH?)\n", __func__, path); + delete ctx; + return nullptr; + } + + if (!ctx->start_ffmpeg(0.0f)) { + LOG_ERR("%s: failed to start ffmpeg for '%s' (is ffmpeg in PATH?)\n", __func__, path); + delete ctx; + return nullptr; + } + + return ctx; +#else + LOG_ERR("%s: video is not supported in this build (MTMD_VIDEO is set to OFF)\n", __func__); + return nullptr; +#endif +} + +mtmd_helper_video * mtmd_helper_video_init_from_buf( + mtmd_context * mctx, + const unsigned char * buf, size_t len, + mtmd_helper_video_init_params params) { +#ifdef MTMD_VIDEO + auto * ctx = new mtmd_helper_video(); + + ctx->mctx = mctx; + ctx->input_buf.assign(buf, buf + len); + ctx->ffmpeg_bin = video_resolve_bin(params.ffmpeg_bin_dir, "ffmpeg"); + ctx->ffprobe_bin = video_resolve_bin(params.ffmpeg_bin_dir, "ffprobe"); + ctx->timestamp_interval_ms = params.timestamp_interval_ms; + + if (!ctx->probe(params.fps_target)) { + LOG_ERR("%s: ffprobe failed on buffer (is ffprobe in PATH?)\n", __func__); + delete ctx; + return nullptr; + } + + if (!ctx->start_ffmpeg(0.0f)) { + LOG_ERR("%s: failed to start ffmpeg on buffer (is ffmpeg in PATH?)\n", __func__); + delete ctx; + return nullptr; + } + + return ctx; +#else + LOG_ERR("%s: video is not supported in this build (MTMD_VIDEO is set to OFF)\n", __func__); + return nullptr; +#endif +} + +void mtmd_helper_video_free(mtmd_helper_video * ctx) { +#ifdef MTMD_VIDEO + if (!ctx) return; + ctx->stop_ffmpeg(); + delete ctx; +#else + LOG_ERR("%s: video is not supported in this build (MTMD_VIDEO is set to OFF)\n", __func__); +#endif +} + +mtmd_helper_video_info mtmd_helper_video_get_info(const mtmd_helper_video * ctx) { +#ifdef MTMD_VIDEO + return ctx->info; +#else + GGML_ASSERT(false && "video is not supported in this build (MTMD_VIDEO is set to OFF)"); +#endif +} + +int32_t mtmd_helper_video_read_next(mtmd_helper_video * ctx, + mtmd_bitmap ** out_bitmap, char ** out_text) { 
+#ifdef MTMD_VIDEO + if (!ctx) return -2; + return ctx->read_next(out_bitmap, out_text); +#else + GGML_ASSERT(false && "video is not supported in this build (MTMD_VIDEO is set to OFF)"); +#endif +} diff --git a/tools/mtmd/mtmd-helper.h b/tools/mtmd/mtmd-helper.h index 7eecbb0672..164b7c6689 100644 --- a/tools/mtmd/mtmd-helper.h +++ b/tools/mtmd/mtmd-helper.h @@ -20,25 +20,39 @@ extern "C" { // BREAKING CHANGES are expected. // +struct mtmd_helper_video; +typedef struct mtmd_helper_video mtmd_helper_video; + // Set callback for all future logging events. // If this is not called, or NULL is supplied, everything is output on stderr. // Note: this also call mtmd_log_set() internally MTMD_API void mtmd_helper_log_set(ggml_log_callback log_callback, void * user_data); +// Returns true if this build includes video support (MTMD_VIDEO was ON at compile time). +MTMD_API bool mtmd_helper_support_video(mtmd_context * ctx); + +struct mtmd_helper_bitmap_wrapper { + mtmd_bitmap * bitmap; + mtmd_helper_video * video_ctx; +}; + // helper function to construct a mtmd_bitmap from a file // it calls mtmd_helper_bitmap_init_from_buf() internally // returns nullptr on failure // this function is thread-safe -MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder); +MTMD_API struct mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder); // helper function to construct a mtmd_bitmap from a buffer containing a file // supported formats: // image: formats supported by stb_image: jpg, png, bmp, gif, etc. 
// audio: formats supported by miniaudio: wav, mp3, flac -// note: audio files will be auto-detected based on magic bytes +// note: +// - for now, video input is only supported via C++ helper functions +// - audio files will be auto-detected based on magic bytes +// - output bitmap will have FNV hash as the ID // returns nullptr on failure // this function is thread-safe -MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder); +MTMD_API struct mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder); // helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks); @@ -89,6 +103,56 @@ MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx, int32_t n_batch, llama_pos * new_n_past); +// +// video input helpers (requires ffmpeg/ffprobe installed on the system) +// the notion of video only exists at the helper level, it is not visible to the core mtmd library +// +// NOTE: this implementation is model-agnostic, it can be used with any vision-capable model +// however, it may not be accurate for some specific models +// (this is expected for now, to keep the implementation simple) +// + +struct mtmd_helper_video_info { + uint32_t width; + uint32_t height; + float fps; // effective fps (fps_target if set, else original video fps) + int32_t n_frames; // estimated total frames at effective fps (-1 if unknown) +}; + +struct mtmd_helper_video_init_params { + float fps_target; // desired output fps; <= 0 means use the video's native fps, defaulted to 4.0f + const char * ffmpeg_bin_dir; // directory containing ffmpeg/ffprobe binaries; NULL means search PATH + int64_t timestamp_interval_ms; // interval for adding timestamp as text chunk (example: "[10m50.5s]"); <= 0 means no timestamp, defaulted to 
5000ms + // TODO @ngxson : allow "placeholder" bitmap output for counting tokens +}; + +MTMD_API struct mtmd_helper_video_init_params mtmd_helper_video_init_params_default(void); + +// returns NULL on failure (ffprobe not found, file unreadable, etc.) +MTMD_API mtmd_helper_video * mtmd_helper_video_init( + struct mtmd_context * mctx, + const char * path, + struct mtmd_helper_video_init_params params); + +// Same as mtmd_helper_video_init(), but reads from an in-memory buffer. +// The buffer is copied internally; the caller does not need to keep it alive. +// Note: pipe input is not seekable, so seeking will use output-side seeking +// (ffmpeg decodes and discards frames up to the target position). +MTMD_API mtmd_helper_video * mtmd_helper_video_init_from_buf( + struct mtmd_context * mctx, + const unsigned char * buf, size_t len, + struct mtmd_helper_video_init_params params); +MTMD_API void mtmd_helper_video_free(mtmd_helper_video * ctx); +MTMD_API struct mtmd_helper_video_info mtmd_helper_video_get_info(const mtmd_helper_video * ctx); + +// Read the next item from the video stream; exactly one of out_bitmap or out_text is set per call. 
+// *out_bitmap - heap-allocated; caller must free with mtmd_bitmap_free() +// *out_text - heap-allocated (always via strdup/malloc); caller must free with free() +// returns 0 on success, -1 on EOF, -2 on error +MTMD_API int32_t mtmd_helper_video_read_next(mtmd_helper_video * ctx, + mtmd_bitmap ** out_bitmap, + char ** out_text); + #ifdef __cplusplus } // extern "C" #endif @@ -97,4 +161,16 @@ MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx, // C++ wrappers // +#ifdef __cplusplus +namespace mtmd_helper { + +// video-related C++ wrappers +struct mtmd_helper_video_deleter { + void operator()(mtmd_helper_video * val) { mtmd_helper_video_free(val); } +}; +using video_ptr = std::unique_ptr; + +} // namespace mtmd_helper +#endif + #endif diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index c93fb1e0a4..4140a3c4aa 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -35,6 +35,10 @@ struct mtmd_bitmap { std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking bool is_audio = false; // true if the bitmap is audio + // lazy-loaded bitmap + mtmd_bitmap_lazy_callback lazy_callback = nullptr; + void * lazy_user_data = nullptr; + mtmd_bitmap(const unsigned char * data, uint32_t nx, uint32_t ny) : nx(nx), ny(ny), is_audio(false) { if (data) { @@ -732,30 +736,111 @@ void mtmd_free(mtmd_context * ctx) { struct mtmd_tokenizer { mtmd_context * ctx; - std::vector bitmaps; std::string input_text; bool add_special; bool parse_special; const llama_vocab * vocab; + struct part { + std::string text; + const mtmd_bitmap * bitmap; + }; + std::vector parts; + // these will be freed when mtmd_tokenizer finishes + std::vector bm_from_lazy; // TODO @ngxson : refactor, free bm_from_lazy progressively + std::vector text_from_lazy; + mtmd_input_chunks cur; uint32_t n_images_added = 0; // 0-based index assigned to the next image chunk + ~mtmd_tokenizer() { + // note: mtmd::bitmap is already RAII + for (auto & str : 
text_from_lazy) { + free((void *)str); + } + } + mtmd_tokenizer(mtmd_context * ctx, const mtmd_input_text * text, - const mtmd_bitmap ** bitmaps, - size_t n_bitmaps) : ctx(ctx), bitmaps(bitmaps, bitmaps + n_bitmaps) { + const mtmd_bitmap ** bmps, + size_t n_bitmaps) : ctx(ctx) { add_special = text->add_special; parse_special = text->parse_special; input_text = text->text; vocab = ctx->vocab; + + std::vector bitmaps(bmps, bmps + n_bitmaps); + auto parts_str = split_text(input_text, ctx->media_marker); + size_t i_bm = 0; + for (const auto & part : parts_str) { + if (part == ctx->media_marker) { + if (i_bm >= bitmaps.size()) { + throw std::runtime_error(string_format("number of media markers in text (%zu) exceeds number of bitmaps (%zu)", i_bm + 1, bitmaps.size())); + } + parts.push_back({"", bitmaps[i_bm++]}); + } else { + parts.push_back({std::move(part), nullptr}); + } + } + + size_t n_markers = 0; + for (const auto & part : parts) { + if (part.bitmap != nullptr) { + n_markers++; + } + } + if (n_markers != bitmaps.size()) { + throw std::runtime_error(string_format("number of media markers in text (%zu) does not match number of bitmaps (%zu)", n_markers, bitmaps.size())); + } + + expand_lazy_bitmaps(); + } + + void expand_lazy_bitmaps() { + std::vector expanded; + expanded.reserve(parts.size()); + for (auto & p : parts) { + if (p.bitmap != nullptr && p.bitmap->lazy_callback) { + LOG_DBG("%s: expanding lazy bitmap\n", __func__); + for (size_t i = 0;; i++) { + char * out_str = nullptr; + mtmd_bitmap * out_bm = nullptr; + int res = p.bitmap->lazy_callback(i, + p.bitmap->lazy_user_data, + &out_bm, + &out_str); + if (out_bm && out_str) { + throw std::runtime_error(string_format("lazy callback cannot return both bitmap and text")); + } + if (res == 0) { + // OK, append the returned chunk; lazy part is not yet added + if (out_bm) { + auto & ptr = bm_from_lazy.emplace_back(out_bm); // remember to free it later + expanded.push_back({"", ptr.ptr.get()}); + LOG_DBG("%s: lazy 
callback returned bitmap with dimensions %d x %d\n", __func__, out_bm->nx, out_bm->ny); + } else if (out_str) { + auto & ptr = text_from_lazy.emplace_back(out_str); // remember to free it later + expanded.push_back({ptr, nullptr}); + LOG_DBG("%s: lazy callback returned text: %s\n", __func__, out_str); + } + } else if (res == -1) { + // EOF: lazy part removes itself (not added to expanded) + break; + } else if (res == -2) { + // error + throw std::runtime_error(string_format("lazy callback returned error")); + } + } + } else { + expanded.push_back(std::move(p)); + } + } + parts = std::move(expanded); } int32_t tokenize(mtmd_input_chunks * output) { cur.entries.clear(); - std::vector parts = split_text(input_text, ctx->media_marker); - size_t i_bm = 0; // index of the current bitmap // [QWEN_VIDEO] handle frame merging for models that support it (i.e. qwen-vl) int n_merge_frames = 1; @@ -764,53 +849,50 @@ struct mtmd_tokenizer { GGML_ASSERT(n_merge_frames <= 2 && "we only support merging maximum 2 images for now; open an issue if this model supports merging more"); } + // Build merged_bitmaps: each entry is a group of 1 or 2 bitmaps. + // For consecutive mergeable bitmap parts, merge them and collapse the second part out of this->parts. 
std::vector> merged_bitmaps; if (n_merge_frames > 1) { - size_t i_bm_scan = 0; for (size_t i = 0; i < parts.size(); ++i) { - if (parts[i] != ctx->media_marker) { + if (parts[i].bitmap == nullptr) { continue; } - if (i + 1 < parts.size() - && parts[i + 1] == ctx->media_marker - && i_bm_scan + 1 < bitmaps.size()) { - const mtmd_bitmap * bm_a = bitmaps[i_bm_scan]; - const mtmd_bitmap * bm_b = bitmaps[i_bm_scan + 1]; + if (i + 1 < parts.size() && parts[i + 1].bitmap != nullptr) { + const mtmd_bitmap * bm_a = parts[i].bitmap; + const mtmd_bitmap * bm_b = parts[i + 1].bitmap; if (bm_a->can_batch_with(*bm_b)) { - LOG_DBG("%s: merging 2 frames at bitmap index %zu and %zu\n", __func__, i_bm_scan, i_bm_scan + 1); + LOG_DBG("%s: merging 2 frames at part index %zu and %zu\n", __func__, i, i + 1); merged_bitmaps.push_back({bm_a, bm_b}); - parts.erase(parts.begin() + i + 1); // remove the second marker - i_bm_scan += 2; + parts.erase(parts.begin() + i + 1); // collapse the second bitmap part continue; } } - LOG_DBG("%s: no merging for bitmap index %zu\n", __func__, i_bm_scan); - merged_bitmaps.push_back({bitmaps[i_bm_scan]}); - ++i_bm_scan; + LOG_DBG("%s: no merging for part index %zu\n", __func__, i); + merged_bitmaps.push_back({parts[i].bitmap}); } } else { - for (size_t i = 0; i < bitmaps.size(); ++i) { - merged_bitmaps.push_back({bitmaps[i]}); + for (const auto & p : parts) { + if (p.bitmap != nullptr) { + merged_bitmaps.push_back({p.bitmap}); + } } } - i_bm = 0; - for (auto & part : parts) { - if (part == ctx->media_marker) { - // this is a marker, we should add the next bitmap + size_t i_bm = 0; + for (const auto & p : parts) { + if (p.bitmap != nullptr) { if (i_bm >= merged_bitmaps.size()) { LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n", __func__, merged_bitmaps.size(), parts.size() - 1); return 1; } - auto & bmps = merged_bitmaps[i_bm++]; + auto bmps = merged_bitmaps[i_bm++]; int32_t res = add_media(bmps); if (res != 0) { return 
res; } } else { - // this is a text part, we should add it as text - add_text(part, parse_special); + add_text(p.text, parse_special); } } @@ -1236,8 +1318,13 @@ int32_t mtmd_tokenize(mtmd_context * ctx, const mtmd_input_text * text, const mtmd_bitmap ** bitmaps, size_t n_bitmaps) { - mtmd_tokenizer tokenizer(ctx, text, bitmaps, n_bitmaps); - return tokenizer.tokenize(output); + try { + mtmd_tokenizer tokenizer(ctx, text, bitmaps, n_bitmaps); + return tokenizer.tokenize(output); + } catch (const std::exception & e) { + LOG_ERR("%s: error: %s\n", __func__, e.what()); + return 2; + } } int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) { @@ -1373,6 +1460,10 @@ int mtmd_get_audio_sample_rate(const mtmd_context * ctx) { return clip_get_hparams(ctx->ctx_a)->audio_sample_rate; } +const char * mtmd_get_marker(const mtmd_context * ctx) { + return ctx->media_marker.c_str(); +} + // // public API functions // @@ -1405,10 +1496,16 @@ uint32_t mtmd_bitmap_get_ny(const mtmd_bitmap * bitmap) { } const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) { + if (bitmap->is_placeholder()) { + return nullptr; + } return bitmap->get_ro_buf().data(); } size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) { + if (bitmap->is_placeholder()) { + return 0; + } return bitmap->get_ro_buf().size(); } @@ -1428,6 +1525,18 @@ void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id) { } } +mtmd_bitmap * mtmd_bitmap_init_lazy(mtmd_context * ctx, + const char * id, + void * user_data, + mtmd_bitmap_lazy_callback callback) { + GGML_UNUSED(ctx); // reserved for future use + mtmd_bitmap * bitmap = new mtmd_bitmap(nullptr, 0, 0); + bitmap->lazy_callback = callback; + bitmap->lazy_user_data = user_data; + mtmd_bitmap_set_id(bitmap, id); + return bitmap; +} + void mtmd_bitmap_free(mtmd_bitmap * bitmap) { if (bitmap) { delete bitmap; diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index 128fb18261..a76a6ec2b8 100644 --- a/tools/mtmd/mtmd.h +++ 
b/tools/mtmd/mtmd.h @@ -128,6 +128,9 @@ MTMD_API bool mtmd_support_audio(const mtmd_context * ctx); // return -1 if audio is not supported MTMD_API int mtmd_get_audio_sample_rate(const mtmd_context * ctx); +// get the current marker string +MTMD_API const char * mtmd_get_marker(const mtmd_context * ctx); + // mtmd_bitmap // // if bitmap is image: @@ -156,6 +159,34 @@ MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap); MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap); MTMD_API void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id); +// mtmd_bitmap lazy +// +// this is a special bitmap that: +// - does not hold the actual data +// - can be expanded into one or more chunks (either media or text chunks) +// user must provide a callback to fill in the data when mtmd_tokenize() is called +// this is useful for large video inputs: +// - allow reading video frame by frame, without loading the entire video into memory +// - allow tracking the whole video with a single ID (for example, the file hash) + +// set (*out_bitmap) to non-nullptr to emit a bitmap chunk; it will be freed automatically +// set (*out_text) to non-nullptr to emit a text chunk; it must be heap-allocated, null-terminated and will be freed automatically +// either out_bitmap or out_text can be set, but not both +// out_bitmap cannot be another lazy bitmap (no nested lazy allowed) +// return value: +// 0 on success +// -1 on EOF (signal to mtmd_tokenize to move on) +// -2 on error (signal to mtmd_tokenize to abort) +typedef int(* mtmd_bitmap_lazy_callback)( + size_t chunk_idx, + void * user_data, + mtmd_bitmap ** out_bitmap, + char ** out_text); + +MTMD_API mtmd_bitmap * mtmd_bitmap_init_lazy(mtmd_context * ctx, + const char * id, // usually set to file hash + void * user_data, + mtmd_bitmap_lazy_callback callback); // mtmd_input_chunks // diff --git a/tools/mtmd/test-3.mp4 b/tools/mtmd/test-3.mp4 new file mode 100644 index 0000000000..fedf3975be Binary files /dev/null and 
b/tools/mtmd/test-3.mp4 differ diff --git a/tools/server/README.md b/tools/server/README.md index bf056dc60b..f507b8c181 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1252,6 +1252,10 @@ The `response_format` parameter supports both plain JSON output (e.g. `{"type": `parallel_tool_calls` : Whether to enable parallel/multiple tool calls (only supported on some models, verification is based on jinja template). +For multimodal input: +- Content type `image_url` and `input_audio` are the same as OAI schema +- Content type `input_video` is an extension from OAI schema. For now, it only accepts base64 input + *Examples:* You can use either Python `openai` library with appropriate checkpoints: diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index dfd286d24e..9f3caac8f7 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -701,29 +701,19 @@ size_t validate_utf8(const std::string& text) { return len; } -// Computes FNV-1a hash of the data -static std::string fnv_hash(const uint8_t * data, size_t len) { - const uint64_t fnv_prime = 0x100000001b3ULL; - uint64_t hash = 0xcbf29ce484222325ULL; - - for (size_t i = 0; i < len; ++i) { - hash ^= data[i]; - hash *= fnv_prime; - } - return std::to_string(hash); -} - server_tokens process_mtmd_prompt(mtmd_context * mctx, const std::string & prompt, const std::vector & files, bool is_placeholder) { + // these will be freed upon going out of scope mtmd::bitmaps bitmaps; + std::vector videos; for (auto & file : files) { - mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size(), is_placeholder)); - if (!bmp.ptr) { + auto out = mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size(), is_placeholder); + if (!out.bitmap) { throw std::runtime_error("Failed to load image or audio file"); } - // calculate bitmap hash (for KV caching) - std::string hash = fnv_hash(bmp.data(), bmp.n_bytes()); - bmp.set_id(hash.c_str()); - 
bitmaps.entries.push_back(std::move(bmp)); + bitmaps.entries.emplace_back(out.bitmap); + if (out.video_ctx) { + videos.emplace_back(out.video_ctx); + } } // process prompt std::vector inputs; @@ -1023,6 +1013,20 @@ json oaicompat_chat_params_parse( p["text"] = get_media_marker(); p.erase("input_audio"); + } else if (type == "input_video") { + if (!opt.allow_video) { + throw std::runtime_error("video input is not supported - hint: if this is unexpected, you may need to provide the mmproj"); + } + + json input_video = json_value(p, "input_video", json::object()); + std::string data = json_value(input_video, "data", std::string()); + auto decoded_data = base64_decode(data); // expected to be base64 encoded + out_files.push_back(decoded_data); + + p["type"] = "media_marker"; + p["text"] = get_media_marker(); + p.erase("input_video"); + } else if (type != "text") { throw std::invalid_argument("unsupported content[].type"); } diff --git a/tools/server/server-common.h b/tools/server/server-common.h index 51b1613178..249b97c2fa 100644 --- a/tools/server/server-common.h +++ b/tools/server/server-common.h @@ -294,6 +294,7 @@ struct server_chat_params { common_chat_templates_ptr tmpls; bool allow_image; bool allow_audio; + bool allow_video; bool enable_thinking = true; int reasoning_budget = -1; std::string reasoning_budget_message; diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 07759f4170..6fa302e132 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -1247,6 +1247,7 @@ private: /* tmpls */ std::move(chat_templates), /* allow_image */ mctx ? mtmd_support_vision(mctx) : false, /* allow_audio */ mctx ? mtmd_support_audio (mctx) : false, + /* allow_video */ mctx ? 
mtmd_helper_support_video(mctx) : false, /* enable_thinking */ enable_thinking, /* reasoning_budget */ params_base.sampling.reasoning_budget_tokens, /* reasoning_budget_msg */ params_base.sampling.reasoning_budget_message, @@ -3586,6 +3587,7 @@ server_context_meta server_context::get_meta() const { /* has_mtmd */ impl->mctx != nullptr, /* has_inp_image */ impl->chat_params.allow_image, /* has_inp_audio */ impl->chat_params.allow_audio, + /* has_inp_video */ impl->chat_params.allow_video, /* json_ui_settings */ impl->json_ui_settings, /* json_webui_settings */ impl->json_webui_settings, // Deprecated /* slot_n_ctx */ impl->get_slot_n_ctx(), @@ -4183,6 +4185,7 @@ void server_routes::init_routes() { { "model_path", meta->model_path }, { "modalities", json { {"vision", meta->has_inp_image}, + {"video", meta->has_inp_video}, {"audio", meta->has_inp_audio}, } }, { "media_marker", get_media_marker() }, @@ -4976,7 +4979,7 @@ std::unique_ptr server_routes::handle_count_tokens(const l n_tokens = tokenize_mixed(vocab, prompt, true, true).size(); } - json response = {{"input_tokens", static_cast(n_tokens)}}; + json response = {{"input_tokens", static_cast(n_tokens)}}; if (is_oai) { response["object"] = "response.input_tokens"; } diff --git a/tools/server/server-context.h b/tools/server/server-context.h index 72a1f40e01..0e84785af4 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -21,6 +21,7 @@ struct server_context_meta { bool has_mtmd; bool has_inp_image; bool has_inp_audio; + bool has_inp_video; json json_ui_settings; // Primary: new name json json_webui_settings; // Deprecated: use json_ui_settings instead (kept for backward compat) int slot_n_ctx;