[Scummvm-git-logs] scummvm master -> c8102c4c6f7fa4450972544bc35314eb2639c0de

Tue May 20 09:05:27 UTC 2025

This automated email contains information about 1 new commit which has been
pushed to the 'scummvm' repo located at https://api.github.com/repos/scummvm/scummvm .

Summary:
c8102c4c6f EMSCRIPTEN: TTS: Add TTS based on Web Speech API


Commit: c8102c4c6f7fa4450972544bc35314eb2639c0de
    https://github.com/scummvm/scummvm/commit/c8102c4c6f7fa4450972544bc35314eb2639c0de
Author: Christian Kündig (christian at kuendig.info)
Date: 2025-05-20T12:05:23+03:00

Commit Message:
EMSCRIPTEN: TTS: Add TTS based on Web Speech API

Changed paths:
  A backends/text-to-speech/emscripten/emscripten-text-to-speech.cpp
  A backends/text-to-speech/emscripten/emscripten-text-to-speech.h
    backends/module.mk
    backends/platform/sdl/emscripten/emscripten.cpp
    backends/platform/sdl/emscripten/emscripten.h
    common/text-to-speech.h
    configure

diff --git a/backends/module.mk b/backends/module.mk
index 7d3c71cc11b..2d3a0f26fac 100644
--- a/backends/module.mk
+++ b/backends/module.mk
@@ -93,6 +93,12 @@ MODULE_OBJS += \
 	networking/curl/url.o
 endif
 
+ifdef EMSCRIPTEN
+ifdef USE_TTS
+MODULE_OBJS += \
+	text-to-speech/emscripten/emscripten-text-to-speech.o
+endif
+endif
 ifdef USE_SDL_NET
 MODULE_OBJS += \
 	networking/sdl_net/client.o \
diff --git a/backends/platform/sdl/emscripten/emscripten.cpp b/backends/platform/sdl/emscripten/emscripten.cpp
index 705ce128be2..8f545246f53 100644
--- a/backends/platform/sdl/emscripten/emscripten.cpp
+++ b/backends/platform/sdl/emscripten/emscripten.cpp
@@ -65,6 +65,16 @@ EM_JS(void, downloadFile, (const char *filenamePtr, char *dataPtr, int dataSize)
 });
 
 // Overridden functions
+
+void OSystem_Emscripten::initBackend() {
+#ifdef USE_TTS
+	// Initialize Text to Speech manager
+	_textToSpeechManager = new EmscriptenTextToSpeechManager();
+#endif
+	// Invoke parent implementation of this method
+	OSystem_POSIX::initBackend();
+}
+
 bool OSystem_Emscripten::hasFeature(Feature f) {
 	if (f == kFeatureFullscreenMode)
 		return true;
diff --git a/backends/platform/sdl/emscripten/emscripten.h b/backends/platform/sdl/emscripten/emscripten.h
index 4524067c5f6..9964ea2d010 100644
--- a/backends/platform/sdl/emscripten/emscripten.h
+++ b/backends/platform/sdl/emscripten/emscripten.h
@@ -26,6 +26,7 @@
 
 class OSystem_Emscripten : public OSystem_POSIX {
 public:
+	void initBackend() override;
 	bool hasFeature(Feature f) override;
 	void setFeatureState(Feature f, bool enable) override;
 	bool getFeatureState(Feature f) override;
diff --git a/backends/text-to-speech/emscripten/emscripten-text-to-speech.cpp b/backends/text-to-speech/emscripten/emscripten-text-to-speech.cpp
new file mode 100644
index 00000000000..8c8b2b66d05
--- /dev/null
+++ b/backends/text-to-speech/emscripten/emscripten-text-to-speech.cpp
@@ -0,0 +1,360 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+// Disable symbol overrides so that we can use system headers.
+#define FORBIDDEN_SYMBOL_EXCEPTION_FILE
+#define FORBIDDEN_SYMBOL_EXCEPTION_getenv
+
+#include "backends/text-to-speech/emscripten/emscripten-text-to-speech.h"
+
+#if defined(USE_TTS) && defined(EMSCRIPTEN)
+#include <emscripten.h>
+
+#include "common/config-manager.h"
+#include "common/system.h"
+#include "common/translation.h"
+#include "common/ustr.h"
+#include "common/debug.h"
+
+EM_JS(void, ttsInit, (), {
+	/*
+	* Voices can come from the browser, the operating system or cloud services. This means we sometimes get
+	* an incomplete or empty list on the first call to getVoices().
+	* Best practice is to listen to the 'voiceschanged' event and update the list of voices when it fires.
+	*/
+	globalThis['ttsVoiceMap'] = {}; // voice.lang -> voice.name -> SpeechSynthesisVoice
+	globalThis['ttsUtteranceQueue'] = []; // utterances waiting to be spoken, filled by _ttsSay
+	const refreshVoices = () => {
+		globalThis['ttsVoiceMap'] = {};
+		var cnt = 0;
+		const voices = window.speechSynthesis.getVoices(); // declared locally, no implicit global
+		Array.from(voices).forEach((voice) => {
+			if (!(voice.lang in globalThis['ttsVoiceMap'])) {
+				globalThis['ttsVoiceMap'][voice.lang] = {};
+			}
+			globalThis['ttsVoiceMap'][voice.lang][voice.name] = voice;
+			cnt++;
+		});
+		console.log("Found %d voices",cnt);
+	};
+
+	if ('onvoiceschanged' in window.speechSynthesis) {
+		window.speechSynthesis.onvoiceschanged = refreshVoices;
+	}
+	refreshVoices();
+});
+
+EmscriptenTextToSpeechManager::EmscriptenTextToSpeechManager() {
+	ttsInit();
+#ifdef USE_TRANSLATION
+	setLanguage(TransMan.getCurrentLanguage());
+#else
+	setLanguage("en");
+#endif
+}
+
+EmscriptenTextToSpeechManager::~EmscriptenTextToSpeechManager() {
+	stop();
+}
+
+
+EM_JS(bool, _ttsSay, (const char *text, const char *voice_name, const char *voice_lang, int pitch, int rate, int volume, int action), {
+	/*
+	* Speaks `text` with the given voice, or queues it behind what is being spoken.
+	* globalThis['ttsUtteranceQueue'][0] is the utterance handed to the browser, later entries wait for it.
+	* Returns false if the voice is unknown, the text is empty or the action decides to add nothing;
+	* returns true once the utterance has been put into the queue.
+	*/
+	voice_name = UTF8ToString(voice_name);
+	voice_lang = UTF8ToString(voice_lang);
+	if (!(voice_lang in globalThis['ttsVoiceMap'] && voice_name in globalThis['ttsVoiceMap'][voice_lang])){
+		console.error("_ttsSay: Voice not found");
+		return false;
+	}
+	text = UTF8ToString(text);
+	if (text === "") {
+		return false;
+	}
+	/*
+	* Possible actions are:
+	*	INTERRUPT - interrupts the current speech
+	*	INTERRUPT_NO_REPEAT - interrupts the speech (deletes the whole queue),
+	*		if the str is the same as the string currently being said,
+	*		it lets the current string finish.
+	*	QUEUE - queues the speech
+	*	QUEUE_NO_REPEAT - queues the speech only if the str is different than
+	*		the last string in the queue (or the string, that is currently
+	*		being said if the queue is empty)
+	*	DROP - does nothing if there is anything being said at the moment
+	*/
+	const Actions = Object.freeze({
+		INTERRUPT: 0,
+		INTERRUPT_NO_REPEAT: 1,
+		QUEUE: 2,
+		QUEUE_NO_REPEAT: 3
+	});
+	console.assert(action <= 3,"_ttsSay: Illegal Action: %d",action);// DROP is handled on the native side so we should only have 0-3.
+
+	if (action == Actions.QUEUE_NO_REPEAT &&
+			globalThis['ttsUtteranceQueue'].length > 0 && globalThis['ttsUtteranceQueue'][globalThis['ttsUtteranceQueue'].length-1].text == text) {
+		console.debug("_ttsSay: Skipping duplicate utterance (QUEUE_NO_REPEAT)");
+		return false;
+	}
+	//  INTERRUPT_NO_REPEAT with a matching string - empty queue but let the current string finish
+	if (action == Actions.INTERRUPT_NO_REPEAT && globalThis['ttsUtteranceQueue'].length > 0 && globalThis['ttsUtteranceQueue'][0].text == text){
+		globalThis['ttsUtteranceQueue'] = globalThis['ttsUtteranceQueue'].slice(0,1);
+		return false;
+	}
+	// interrupt or INTERRUPT_NO_REPEAT with a non-matching string (or no string talking) - empty queue, cancel all talking
+	if (action == Actions.INTERRUPT || action == Actions.INTERRUPT_NO_REPEAT ) {
+		globalThis['ttsUtteranceQueue'] = [];
+		window.speechSynthesis.cancel();
+	}
+	// queue and speak next utterance
+	const voice = globalThis['ttsVoiceMap'][voice_lang][voice_name]; // declared locally, no implicit global
+	const utterance = new SpeechSynthesisUtterance(text);
+	// One handler for both events: drop the finished utterance if it is at the head, then start the next one.
+	const advanceQueue = function(event) {
+		if (globalThis['ttsUtteranceQueue'][0] == event.target){
+			globalThis['ttsUtteranceQueue'].shift(); //remove utterance that was just spoken
+		}
+		if (globalThis['ttsUtteranceQueue'].length > 0 && !window.speechSynthesis.speaking){ // speak next utterance if nothing is being spoken
+			window.speechSynthesis.speak(globalThis['ttsUtteranceQueue'][0]);
+		}
+	};
+	utterance.onend = advanceQueue; // this is triggered when an utterance completes speaking
+	utterance.onerror = advanceQueue; // this includes canceled utterances (so not just errors)
+	/*
+	 * TODO: we could do INTERRUPT_NO_REPEAT and INTERRUPT handling on boundaries, but it's not reliable
+	 * 		 remote voices don't have onboundary event: https://issues.chromium.org/issues/41195426
+	 *
+	 * 	utterance.onboundary = function(event){
+	 *	    console.log(event);
+	 *	};
+	*/
+	utterance.voice = voice;
+	utterance.volume = volume / 100; // linearly adjust 0 to 100 -> 0 to 1
+	utterance.pitch = (pitch + 100) / 100; // linearly adjust -100 to 100 (0 default) -> 0 to 2 (1 default)
+	utterance.rate = rate > 0 ? 1 + (rate / (100 - 9)) : 0.1 + (rate + 100) / (100 / 0.9); // -100 to 100 (0 default) -> 0.1 to ~2.1 (1 default); NOTE(review): the Web Speech maximum is 10, confirm whether (100 / 9) was intended
+
+	console.debug("Pushing to queue: %s",text);
+	globalThis['ttsUtteranceQueue'].push(utterance);
+	if (globalThis['ttsUtteranceQueue'].length == 1){
+		console.debug("Speaking %s",text);
+		window.speechSynthesis.speak(utterance);
+	}
+	return true;
+});
+
+bool EmscriptenTextToSpeechManager::say(const Common::U32String &str, Action action) {
+	// Speaks str with the active voice. Returns true if it was queued or deliberately dropped, false otherwise.
+	assert(_ttsState->_enabled);
+	Common::String strUtf8 = str.encode();
+	debug(5, "Saying %s (%d)", strUtf8.c_str(), action);
+
+	if (isSpeaking() && action == DROP) {
+		debug(5, "EmscriptenTextToSpeechManager::say - Not saying '%s' as action=DROP and already speaking", strUtf8.c_str());
+		return true;
+	}
+	if ((unsigned)_ttsState->_activeVoice >= _ttsState->_availableVoices.size()) // no (or no longer valid) voice: the lookup below would be out of bounds
+		return false;
+	char **voiceData = (char **)_ttsState->_availableVoices[_ttsState->_activeVoice].getData(); // { name, lang } as stored by updateVoices()
+	return _ttsSay(strUtf8.c_str(), voiceData[0], voiceData[1], _ttsState->_pitch, _ttsState->_rate, _ttsState->_volume, action);
+}
+
+EM_JS(char **, _ttsGetVoices, (), {
+	// Returns a malloc'ed array of malloc'ed UTF-8 strings [name0, lang0, name1, lang1, ..., ""]; nothing here frees them, the native caller must.
+	const voices = Array.from(Object.values(globalThis['ttsVoiceMap'])).map(Object.values).flat() // flatten voice map
+		.sort((a,b) => a.default ===  b.default ? a.name.localeCompare(b.name):a.default?-1:1) // first default, then alphabetically
+		.map(voice=>[voice.name,voice.lang]).flat();
+	voices.push(""); // we need this to find the end of the array on the native side.
+
+	// convert the strings to C strings
+	var c_strings = voices.map((s) => {
+		var size = lengthBytesUTF8(s) + 1;
+		var ret = Module._malloc(size);
+		stringToUTF8Array(s, HEAP8, ret, size);
+		return ret;
+	});
+
+	var ret_arr = Module._malloc(c_strings.length * 4); // 4-bytes per pointer
+	c_strings.forEach((ptr, i) => { Module.setValue(ret_arr + i * 4, ptr, "i32"); }); // populate return array
+	return ret_arr;
+});
+
+void EmscriptenTextToSpeechManager::updateVoices() {
+	// Rebuild the voice list for _ttsState->_language. _ttsGetVoices() hands back malloc'ed name/lang
+	// string pairs ending with ""; strings of kept voices are released by freeVoiceData(), all others here.
+	_ttsState->_availableVoices.clear();
+	char **ttsVoices = _ttsGetVoices();
+	char **iter = ttsVoices;
+	while (strcmp(*iter, "") != 0) {
+		char *c_name = *iter++;
+		char *c_lang = *iter++;
+		Common::String language = Common::String(c_lang);
+		Common::String name = Common::String(c_name);
+		bool keep = _ttsState->_language == language.substr(0, 2);
+		// some systems have the same voice multiple times for the same language (e.g. en-US and en-GB),
+		// in that case we should add the locale to the name
+		Common::String qualified = Common::String::format("%s (%s)", c_name, language.substr(3, 2).c_str());
+		// Compare against the voices actually stored (not a side list), so indices can never drift apart.
+		for (unsigned i = 0; keep && i < _ttsState->_availableVoices.size(); i++) {
+			char **other = (char **)_ttsState->_availableVoices[i].getData();
+			if (strcmp(other[0], c_name) != 0)
+				continue;
+			Common::String other_new = Common::String::format("%s (%s)", other[0], Common::String(other[1]).substr(3, 2).c_str());
+			if (other_new == qualified) {
+				// some systems have identical name/language/locale pairs multiple times (seems a bug), we just skip that case (e.g. macOS Safari for "Samantha (en_US)" )
+				warning("Skipping duplicate voice %s %s", c_name, c_lang);
+				keep = false;
+			} else {
+				warning("Adding duplicate voice %s %s", _ttsState->_availableVoices[i].getDescription().c_str(), qualified.c_str());
+				_ttsState->_availableVoices[i].setDescription(other_new);
+				name = qualified;
+			}
+		}
+		if (!keep) {
+			// Wrong language or exact duplicate: nobody else will ever free these two strings.
+			free(c_name);
+			free(c_lang);
+			continue;
+		}
+		char **data_p = new char *[] { c_name, c_lang };
+		Common::TTSVoice voice(Common::TTSVoice::UNKNOWN_GENDER, Common::TTSVoice::UNKNOWN_AGE, (void *)data_p, name);
+		_ttsState->_availableVoices.push_back(voice);
+	}
+	free(*iter); // the "" terminator is a malloc'ed string as well
+	free(ttsVoices);
+
+	if (_ttsState->_availableVoices.empty()) {
+		warning("No voice is available for language: %s", _ttsState->_language.c_str());
+	}
+}
+
+EM_JS(void, _ttsStop, (), {
+	window.speechSynthesis.cancel();
+});
+
+bool EmscriptenTextToSpeechManager::stop() {
+	_ttsStop();
+	return true;
+}
+
+EM_ASYNC_JS(void, _ttsPause, (), {
+	if(window.speechSynthesis.paused){
+	} else if(window.speechSynthesis.speaking && globalThis['ttsUtteranceQueue'].length > 0){
+		// browsers don't pause immediately, so we have to wait for the pause event if there's something being spoken
+		await (async () => {
+			return new Promise((resolve, reject) => {
+				setTimeout(() => { resolve(); }, 300);
+				globalThis['ttsUtteranceQueue'][0].onpause = (event) =>{ resolve(event)};
+				window.speechSynthesis.pause();
+			});
+		})();
+	} else {
+		assert(globalThis['ttsUtteranceQueue'].length == 0);
+		window.speechSynthesis.pause();
+	}
+	return;
+});
+
+bool EmscriptenTextToSpeechManager::pause() {
+	// Pausing an already paused synthesizer is reported as a failure.
+	const bool wasPaused = isPaused();
+	if (!wasPaused)
+		_ttsPause();
+	return !wasPaused;
+}
+
+EM_JS(void, _ttsResume, (), {
+	window.speechSynthesis.resume();
+});
+
+bool EmscriptenTextToSpeechManager::resume() {
+	// Only a paused synthesizer can be resumed; otherwise report failure.
+	const bool wasPaused = isPaused();
+	if (wasPaused)
+		_ttsResume();
+	return wasPaused;
+}
+
+EM_JS(bool, _ttsIsSpeaking, (), {
+	return window.speechSynthesis.speaking;
+});
+
+bool EmscriptenTextToSpeechManager::isSpeaking() {
+	return _ttsIsSpeaking();
+}
+
+EM_JS(bool, _ttsIsPaused, (), {
+	console.debug("_ttsIsPaused: Checking if speech synthesis is paused %s",window.speechSynthesis.paused ? "true" : "false");
+	return window.speechSynthesis.paused;
+});
+
+bool EmscriptenTextToSpeechManager::isPaused() {
+	return _ttsIsPaused();
+}
+
+bool EmscriptenTextToSpeechManager::isReady() {
+	// Ready means: at least one voice is known and the synthesizer is idle
+	// (neither paused nor speaking).
+	if (_ttsState->_availableVoices.empty())
+		return false;
+	const bool busy = isPaused() || isSpeaking();
+	return !busy;
+}
+
+void EmscriptenTextToSpeechManager::setVoice(unsigned index) {
+	assert(!_ttsState->_enabled || index < _ttsState->_availableVoices.size());
+	_ttsState->_activeVoice = index;
+	return;
+}
+
+void EmscriptenTextToSpeechManager::setRate(int rate) {
+	assert(rate >= -100 && rate <= 100);
+	_ttsState->_rate = rate;
+}
+
+void EmscriptenTextToSpeechManager::setPitch(int pitch) {
+	assert(pitch >= -100 && pitch <= 100);
+	_ttsState->_pitch = pitch;
+}
+
+void EmscriptenTextToSpeechManager::setVolume(unsigned volume) {
+	assert(volume <= 100);
+	_ttsState->_volume = volume;
+}
+
+void EmscriptenTextToSpeechManager::setLanguage(Common::String language) {
+	debug(5, "EmscriptenTextToSpeechManager::setLanguage to %s", language.c_str());
+	if (_ttsState->_language != language.substr(0, 2) || _ttsState->_availableVoices.empty()) {
+		debug(5, "EmscriptenTextToSpeechManager::setLanguage - Update voices");
+		updateVoices();
+		setVoice(0);
+	}
+	Common::TextToSpeechManager::setLanguage(language);
+}
+
+void EmscriptenTextToSpeechManager::freeVoiceData(void *data) { // data is the { name, lang } pair built in updateVoices()
+	free(((char **)data)[0]); // name string, malloc'ed on the JS side by _ttsGetVoices
+	free(((char **)data)[1]); // lang string, malloc'ed the same way
+	delete[] (char **)data;   // the pair itself comes from new[] in updateVoices(), so free() would be a mismatch
+}
+
+#endif
diff --git a/backends/text-to-speech/emscripten/emscripten-text-to-speech.h b/backends/text-to-speech/emscripten/emscripten-text-to-speech.h
new file mode 100644
index 00000000000..abb18ad224c
--- /dev/null
+++ b/backends/text-to-speech/emscripten/emscripten-text-to-speech.h
@@ -0,0 +1,69 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef BACKENDS_TEXT_TO_SPEECH_EMSCRIPTEN_H
+#define BACKENDS_TEXT_TO_SPEECH_EMSCRIPTEN_H
+
+#include "common/scummsys.h"
+
+#if defined(USE_TTS) && defined(EMSCRIPTEN)
+
+#include "common/list.h"
+#include "common/str.h"
+#include "common/text-to-speech.h"
+#include "common/ustr.h"
+
+class EmscriptenTextToSpeechManager final : public Common::TextToSpeechManager {
+public:
+	EmscriptenTextToSpeechManager();
+	~EmscriptenTextToSpeechManager() override;
+
+	bool say(const Common::U32String &str, Action action) override;
+
+	bool stop() override;
+	bool pause() override;
+	bool resume() override;
+
+	bool isSpeaking() override;
+	bool isPaused() override;
+	bool isReady() override;
+
+	void setVoice(unsigned index) override;
+
+	void setRate(int rate) override;
+
+	void setPitch(int pitch) override;
+
+	void setVolume(unsigned volume) override;
+
+	void setLanguage(Common::String language) override;
+
+	void freeVoiceData(void *data) override;
+
+	void updateVoicesPublic() { updateVoices(); };
+
+private:
+	void updateVoices() override;
+};
+
+#endif
+
+#endif // BACKENDS_TEXT_TO_SPEECH_EMSCRIPTEN_H
diff --git a/common/text-to-speech.h b/common/text-to-speech.h
index f9747d3e4d8..8cec91ea3fc 100644
--- a/common/text-to-speech.h
+++ b/common/text-to-speech.h
@@ -120,6 +120,11 @@ class TTSVoice {
 		 */
 		String getDescription() { return _description; };
 
+		/**
+		 * Sets the voice description, in case it has to be changed after creation.
+		 */
+		void setDescription(String description) { _description = description; };
+
 	protected:
 		Gender _gender; ///< Gender of the voice
 		Age _age; ///< Age of the voice
diff --git a/configure b/configure
index a95cc0c1e6e..551d5013b27 100755
--- a/configure
+++ b/configure
@@ -5245,6 +5245,11 @@ int main(void) { return 0; }
 EOF
 			cc_check -ObjC++ -lobjc && _tts=yes
 			;;
+		
+		emscripten)
+			# Emscripten has the "Web Speech API" available
+			_tts=yes
+			;;
 	esac
 fi
 echo "$_tts"
@@ -7096,6 +7101,10 @@ else
 		echo "osx"
 		_tts=yes
 		;;
+	emscripten)
+		echo "Web Speech API"
+		_tts=yes
+		;;
 	*)
 		echo "no"
 		_tts=no