whisper.cpp

Running

App Files Files Community

whisper.cpp / examples /command.wasm /emscripten.cpp

jhenhong

whisper : add context param to disable gpu (#1293)

290abed unverified about 2 years ago

raw

history blame contribute delete

10.5 kB

	#include "ggml.h"
	#include "common.h"
	#include "whisper.h"

	#include <emscripten.h>
	#include <emscripten/bind.h>

	#include <atomic>
	#include <cmath>
	#include <mutex>
	#include <string>
	#include <thread>
	#include <vector>
	#include <regex>

	// upper bound on the threads given to whisper; command_main caps it further
	// with std::thread::hardware_concurrency()
	constexpr int N_THREAD = 8;

	// whisper context slots; "init" fills the first slot that is still nullptr
	std::vector<struct whisper_context *> g_contexts(4, nullptr);

	// taken whenever the status / transcription strings are read or written,
	// and by "set_audio" while it refills g_pcmf32
	std::mutex g_mutex;

	// the background thread that runs command_main
	std::thread g_worker;

	// command_main keeps looping while this is true; "free" clears it
	std::atomic<bool> g_running{false};

	// status reported by the worker
	std::string g_status;

	// status set from the outside via "set_status"; when non-empty it is
	// returned by "get_status" instead of g_status
	std::string g_status_forced;

	// most recently recognized command, handed out by "get_transcribed"
	std::string g_transcribed;

	// audio samples most recently provided through "set_audio"
	std::vector<float> g_pcmf32;

	// Replaces the worker status text (g_status) while holding g_mutex.
	//
	// status: the new text; it is copied into g_status.
	void command_set_status(const std::string & status) {
	    const std::lock_guard<std::mutex> guard(g_mutex);
	    g_status.assign(status);
	}

	// Runs whisper_full() on the given samples and concatenates the text of all segments.
	//
	// ctx, wparams: whisper context and parameters passed straight to whisper_full()
	// pcmf32:       audio samples to transcribe
	// prob:         out - mean of token.p over every token of every segment
	//               (0 when there are no tokens or when whisper_full() fails)
	// t_ms:         out - elapsed wall time in milliseconds (left at 0 when whisper_full() fails)
	//
	// Returns the concatenated segment text, or an empty string when whisper_full() fails.
	std::string command_transcribe(whisper_context * ctx, const whisper_full_params & wparams, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
	    using hr_clock = std::chrono::high_resolution_clock;

	    const hr_clock::time_point started = hr_clock::now();

	    // reset the outputs up front so the failure path reports zeros
	    t_ms = 0;
	    prob = 0.0f;

	    if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
	        return "";
	    }

	    std::string transcript;
	    int tokens_seen = 0;

	    const int segment_count = whisper_full_n_segments(ctx);
	    for (int seg = 0; seg < segment_count; ++seg) {
	        transcript += whisper_full_get_segment_text(ctx, seg);

	        // accumulate the per-token probabilities; averaged below
	        const int token_count = whisper_full_n_tokens(ctx, seg);
	        for (int tok = 0; tok < token_count; ++tok) {
	            const auto data = whisper_full_get_token_data(ctx, seg, tok);

	            prob += data.p;
	            ++tokens_seen;
	        }
	    }

	    if (tokens_seen > 0) {
	        prob /= tokens_seen;
	    }

	    const auto elapsed = hr_clock::now() - started;
	    t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(elapsed).count();

	    return transcript;
	}

	void command_get_audio(int ms, int sample_rate, std::vector<float> & audio) {
	const int64_t n_samples = (ms * sample_rate) / 1000;

	int64_t n_take = 0;
	if (n_samples > (int) g_pcmf32.size()) {
	n_take = g_pcmf32.size();
	} else {
	n_take = n_samples;
	}

	audio.resize(n_take);
	std::copy(g_pcmf32.end() - n_take, g_pcmf32.end(), audio.begin());
	}

	// Worker-thread entry point: polls the shared audio buffer until g_running is cleared.
	//
	// Phase 1 (have_prompt == false): asks the user to say the activation phrase k_prompt,
	// transcribes the last prompt_ms of audio and, if the text is close enough to k_prompt,
	// keeps that audio in pcmf32_prompt.
	// Phase 2 (have_prompt == true): transcribes "prompt audio + last command_ms of audio",
	// finds the prefix of the text that best matches k_prompt, and publishes the remainder
	// as the command through g_transcribed and the status string.
	//
	// index: slot in g_contexts that holds the whisper context to use. When the loop ends
	//        the context in that slot is released with whisper_free() and the slot is reset
	//        to nullptr.
	void command_main(size_t index) {
	    command_set_status("loading data ...");

	    struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);

	    wparams.n_threads        = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
	    wparams.offset_ms        = 0;
	    wparams.translate        = false;
	    wparams.no_context       = true;
	    wparams.single_segment   = true;
	    wparams.print_realtime   = false;
	    wparams.print_progress   = false;
	    wparams.print_timestamps = true;
	    wparams.print_special    = false;

	    wparams.max_tokens       = 32;
	    wparams.audio_ctx        = 768; // partial encoder context for better performance

	    wparams.language         = "en";

	    printf("command: using %d threads\n", wparams.n_threads);

	    bool have_prompt  = false;
	    bool ask_prompt   = true;
	    bool print_energy = false;

	    float prob0 = 0.0f; // mean token probability of the accepted prompt transcription
	    float prob  = 0.0f;

	    std::vector<float> pcmf32_cur;
	    std::vector<float> pcmf32_prompt;

	    const std::string k_prompt = "Ok Whisper, start listening for commands.";

	    // whisper context
	    auto & ctx = g_contexts[index];

	    const int32_t vad_ms     = 2000;
	    const int32_t prompt_ms  = 5000;
	    const int32_t command_ms = 4000;

	    const float vad_thold  = 0.1f;
	    const float freq_thold = -1.0f;

	    while (g_running) {
	        // delay
	        std::this_thread::sleep_for(std::chrono::milliseconds(100));

	        if (ask_prompt) {
	            fprintf(stdout, "\n");
	            fprintf(stdout, "%s: Say the following phrase: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m");
	            fprintf(stdout, "\n");

	            {
	                char msg[1024];
	                snprintf(msg, sizeof(msg), "Say the following phrase: '%s'", k_prompt.c_str());
	                command_set_status(msg);
	            }

	            ask_prompt = false;
	        }

	        int64_t t_ms = 0;

	        command_get_audio(vad_ms, WHISPER_SAMPLE_RATE, pcmf32_cur);

	        // only run the (expensive) transcription once the voice-activity check fires
	        if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, vad_thold, freq_thold, print_energy)) {
	            fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
	            command_set_status("Speech detected! Processing ...");

	            if (!have_prompt) {
	                command_get_audio(prompt_ms, WHISPER_SAMPLE_RATE, pcmf32_cur);

	                const auto txt = ::trim(::command_transcribe(ctx, wparams, pcmf32_cur, prob0, t_ms));

	                fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", txt.c_str(), "\033[0m", (int) t_ms);

	                const float sim = similarity(txt, k_prompt);

	                // accept only if the length is within +-20% of the prompt and the texts are similar enough
	                if (txt.length() < 0.8*k_prompt.length() || txt.length() > 1.2*k_prompt.length() || sim < 0.8f) {
	                    fprintf(stdout, "%s: WARNING: prompt not recognized, try again\n", __func__);
	                    ask_prompt = true;
	                } else {
	                    fprintf(stdout, "\n");
	                    fprintf(stdout, "%s: The prompt has been recognized!\n", __func__);
	                    fprintf(stdout, "%s: Waiting for voice commands ...\n", __func__);
	                    fprintf(stdout, "\n");

	                    command_set_status("Success! Waiting for voice commands ...");

	                    // save the audio for the prompt
	                    pcmf32_prompt = pcmf32_cur;
	                    have_prompt = true;
	                }
	            } else {
	                command_get_audio(command_ms, WHISPER_SAMPLE_RATE, pcmf32_cur);

	                // prepend the prompt audio
	                pcmf32_cur.insert(pcmf32_cur.begin(), pcmf32_prompt.begin(), pcmf32_prompt.end());

	                const auto txt = ::trim(::command_transcribe(ctx, wparams, pcmf32_cur, prob, t_ms));

	                prob = 100.0f*(prob - prob0);

	                fprintf(stdout, "%s: heard '%s'\n", __func__, txt.c_str());

	                // find the prompt in the text
	                float  best_sim = 0.0f;
	                size_t best_len = 0;
	                for (int n = 0.8*k_prompt.size(); n <= 1.2*k_prompt.size(); ++n) {
	                    const auto prompt = txt.substr(0, n);

	                    const float sim = similarity(prompt, k_prompt);

	                    //fprintf(stderr, "%s: prompt = '%s', sim = %f\n", __func__, prompt.c_str(), sim);

	                    if (sim > best_sim) {
	                        best_sim = sim;
	                        best_len = n;
	                    }
	                }

	                // best_len can exceed txt.size() when the transcription is shorter than the
	                // scanned prefix lengths; clamp it, otherwise substr() throws std::out_of_range
	                const std::string command = ::trim(txt.substr(std::min(best_len, txt.size())));

	                fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
	                fprintf(stdout, "\n");

	                {
	                    char msg[1024];
	                    snprintf(msg, sizeof(msg), "Command '%s', (t = %d ms)", command.c_str(), (int) t_ms);
	                    command_set_status(msg);
	                }
	                {
	                    std::lock_guard<std::mutex> lock(g_mutex);
	                    g_transcribed = command;
	                }
	            }

	            // drop the audio that was just processed; "set_audio" writes this buffer
	            // under g_mutex from another thread, so take the same mutex here
	            {
	                std::lock_guard<std::mutex> lock(g_mutex);
	                g_pcmf32.clear();
	            }
	        }
	    }

	    if (index < g_contexts.size()) {
	        whisper_free(g_contexts[index]);
	        g_contexts[index] = nullptr;
	    }
	}

	// JavaScript-facing API of the module, registered through embind.
	EMSCRIPTEN_BINDINGS(command) {
	    // init(path_model): loads a model into the first free slot of g_contexts and starts
	    // the worker thread on it. Returns the 1-based slot number, or 0 when the model
	    // could not be loaded or every slot is taken.
	    emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
	        for (size_t i = 0; i < g_contexts.size(); ++i) {
	            if (g_contexts[i] != nullptr) {
	                continue;
	            }

	            g_contexts[i] = whisper_init_from_file_with_params(path_model.c_str(), whisper_context_default_params());
	            if (g_contexts[i] == nullptr) {
	                return (size_t) 0;
	            }

	            // Stop and join any previous worker BEFORE raising g_running again.
	            // Raising the flag first would re-arm a worker that "free" had asked to
	            // stop, and the join() below would then never return.
	            if (g_worker.joinable()) {
	                g_running = false;
	                g_worker.join();
	            }

	            g_running = true;
	            g_worker = std::thread([i]() {
	                command_main(i);
	            });

	            return i + 1;
	        }

	        return (size_t) 0;
	    }));

	    // free(index): asks the worker loop to stop by clearing g_running. The index is
	    // not used; the worker releases its own context when its loop exits.
	    emscripten::function("free", emscripten::optional_override([](size_t index) {
	        g_running = false;
	    }));

	    // set_audio(index, audio): replaces the contents of g_pcmf32 with `audio`.
	    // index is the 1-based value returned by init.
	    // Returns 0 on success, -1 for an out-of-range index, -2 when the slot has no context.
	    emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) {
	        // to 0-based; an index of 0 wraps around and is rejected by the range check
	        --index;

	        if (index >= g_contexts.size()) {
	            return -1;
	        }

	        if (g_contexts[index] == nullptr) {
	            return -2;
	        }

	        {
	            std::lock_guard<std::mutex> lock(g_mutex);
	            const int n = audio["length"].as<int>();

	            emscripten::val heap = emscripten::val::module_property("HEAPU8");
	            emscripten::val memory = heap["buffer"];

	            g_pcmf32.resize(n);

	            // build a view of audio's own type over the module heap, positioned at
	            // g_pcmf32's storage, and let JS copy into it
	            // NOTE(review): presumably audio is a Float32Array - confirm against the JS caller
	            emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast<uintptr_t>(g_pcmf32.data()), n);
	            memoryView.call<void>("set", audio);
	        }

	        return 0;
	    }));

	    // get_transcribed(): returns the latest recognized command and moves it out of
	    // g_transcribed, so each command is handed out once.
	    emscripten::function("get_transcribed", emscripten::optional_override([]() {
	        std::string transcribed;

	        {
	            std::lock_guard<std::mutex> lock(g_mutex);
	            transcribed = std::move(g_transcribed);
	        }

	        return transcribed;
	    }));

	    // get_status(): returns the forced status if one is set, otherwise the worker status.
	    emscripten::function("get_status", emscripten::optional_override([]() {
	        std::string status;

	        {
	            std::lock_guard<std::mutex> lock(g_mutex);
	            status = g_status_forced.empty() ? g_status : g_status_forced;
	        }

	        return status;
	    }));

	    // set_status(status): sets the forced status; an empty string removes the override.
	    emscripten::function("set_status", emscripten::optional_override([](const std::string & status) {
	        std::lock_guard<std::mutex> lock(g_mutex);
	        g_status_forced = status;
	    }));
	}