c# - How to make a custom model for speech-to-text with the Watson Unity SDK?

Tags: c# unity-game-engine speech-to-text ibm-watson watson

I have made an application in Unity with Watson Assistant, Speech to Text and Text to Speech, in which the user can say different cities to find available flight tickets between them. The conversation and interaction work well, but sometimes I run into the problem that certain cities are not recognized when the user says them. Berlin, for example: sometimes it understands Berlin, and sometimes it understands something like "burning". The same goes for other cities such as Paris, London and Jakarta.

So the detection of the city names is not always as accurate as I would like. I have seen in some posts that you can make your own custom model to improve the detection of these words, but I have no idea how to set that up: how to create my own custom model, how to add those cities to it, and how to train it. Is it possible to do this in a Unity C# script? How would I start? Are there any C# examples I could look at? Any help would be greatly appreciated.

These are some links and pieces of information I have found, but I don't know how to implement them in C# for my purpose of improving the accuracy of city detection:

DwAnswers1, DwAnswers2, StackOverflow, IBM Cloud docs, Medium, cURL tutorial

This is the C# script I use for the interaction between the Watson APIs and Unity. I assume I have to add the custom model here as well, but I don't know whether I should also create the custom model in this script or in a separate one.

using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using IBM.Watson.DeveloperCloud.Services.TextToSpeech.v1;
using IBM.Watson.DeveloperCloud.Services.Conversation.v1;
using IBM.Watson.DeveloperCloud.Services.ToneAnalyzer.v3;
using IBM.Watson.DeveloperCloud.Services.SpeechToText.v1;
using IBM.Watson.DeveloperCloud.Logging;
using IBM.Watson.DeveloperCloud.Utilities;
using IBM.Watson.DeveloperCloud.Connection;
using IBM.Watson.DeveloperCloud.DataTypes;
using MiniJSON;
using UnityEngine.UI;
using FullSerializer;

public class WatsonAgent : MonoBehaviour
{

public string literalEntityCity;
public string destinationCity;
public string departureCity;

public string dateBegin;
public string dateEnd;

public WeatherJSON weather;
public GameObject FlightInfo;

[SerializeField]
private fsSerializer _serializer = new fsSerializer();

[System.Serializable]
public class CredentialInformation
{
    public string username, password, url;
}

[System.Serializable]
public class Services
{
    public CredentialInformation
        textToSpeech,
        conversation,
        speechToText;
}

[Header("Credentials")]
[Space]
public Services
    serviceCredentials;

[Space]
[Header("Agent voice settings")]
[Space]
public AudioSource
    voiceSource;

public VoiceType
    voiceType;

[Space]
[Header("Conversation settings")]
[Space]
public string
    workspaceId;

[Space]
[Header("Feedback fields")]
[Space]
public Text
    speechToTextField;
public Text
    conversationInputField;
public Text
    conversationOutputField;

public string
    saying;

// services
SpeechToText
    speechToText;

private int
    recordingRoutine = 0,
    recordingBufferSize = 1,
    recordingHZ = 22050;

private string
    microphoneID = null;

private AudioClip
    recording = null;

TextToSpeech
    textToSpeech;

Conversation
    conversation;

private Dictionary<string, object>
    conversationContext = null;
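
// NOTE: characterState, SocialState, OnFail, Analyze and WeatherJSON are
// used further down but were not part of the posted script (they live in
// other scripts of the project); these are minimal stand-ins so the
// snippet compiles on its own.
public enum SocialState { idle, listening, thinking, talking }
public SocialState characterState = SocialState.idle;

[System.Serializable]
public class WeatherJSON
{
    public string temperatureCity;
    public float temperatureNumber;
}

private void OnFail(RESTConnector.Error error, Dictionary<string, object> customData)
{
    Log.Error("WatsonAgent.OnFail()", "Watson call failed: {0}", error);
}

private IEnumerator Analyze(string text)
{
    // Stand-in for the tone-analysis coroutine referenced in Synthesize().
    yield return null;
}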

private void Start()
{
    PrepareCredentials();
    Initialize();
}

void PrepareCredentials()
{
    speechToText = new SpeechToText(GetCredentials(serviceCredentials.speechToText));
    textToSpeech = new TextToSpeech(GetCredentials(serviceCredentials.textToSpeech));
    conversation = new Conversation(GetCredentials(serviceCredentials.conversation));
}

Credentials GetCredentials(CredentialInformation credentialInformation)
{
    return new Credentials(credentialInformation.username, credentialInformation.password, credentialInformation.url);
}

void Initialize()
{
    conversation.VersionDate = "2017-05-26";
    Active = true;
    StartRecording();
}

// speech to text
public bool Active
{
    get { return speechToText.IsListening; }
    set
    {
        if (value && !speechToText.IsListening)
        {
            speechToText.DetectSilence = true;
            speechToText.EnableWordConfidence = true;
            speechToText.EnableTimestamps = true;
            speechToText.SilenceThreshold = 0.01f;
            speechToText.MaxAlternatives = 0;
            speechToText.EnableInterimResults = true;
            speechToText.OnError = OnSpeechError;
            speechToText.InactivityTimeout = -1;
            speechToText.ProfanityFilter = false;
            speechToText.SmartFormatting = true;
            speechToText.SpeakerLabels = false;
            speechToText.WordAlternativesThreshold = null;
            speechToText.StartListening(OnSpeechRecognize);
            //speechToText.CustomizationId = "customID";   // presumably the trained custom model's ID has to be set here
            //speechToText.CustomizationWeight(0.2);       // and its weight relative to the base model
        }
        else if (!value && speechToText.IsListening)
        {
            speechToText.StopListening();
        }
    }
}

private void StartRecording()
{
    if (recordingRoutine == 0)
    {
        UnityObjectUtil.StartDestroyQueue();
        recordingRoutine = Runnable.Run(RecordingHandler());
    }
}

private void StopRecording()
{
    if (recordingRoutine != 0)
    {
        Microphone.End(microphoneID);
        Runnable.Stop(recordingRoutine);
        recordingRoutine = 0;
    }
}

private void OnSpeechError(string error)
{
    Active = false;

    Log.Debug("ExampleStreaming.OnError()", "Error! {0}", error);
}

private IEnumerator RecordingHandler()
{
    recording = Microphone.Start(microphoneID, true, recordingBufferSize, recordingHZ);
    yield return null;      // let recordingRoutine get set before the loop below reads it

    if (recording == null)
    {
        StopRecording();
        yield break;
    }

    bool bFirstBlock = true;
    int midPoint = recording.samples / 2;
    float[] samples = null;

    while (recordingRoutine != 0 && recording != null)
    {
        int writePos = Microphone.GetPosition(microphoneID);
        if (writePos > recording.samples || !Microphone.IsRecording(microphoneID))
        {
            Debug.Log("Microphone disconnected.");
            StopRecording();
            yield break;
        }

        if ((bFirstBlock && writePos >= midPoint) || (!bFirstBlock && writePos < midPoint))
        {
            // front block is recorded, make a RecordClip and pass it onto our callback.
            samples = new float[midPoint];
            recording.GetData(samples, bFirstBlock ? 0 : midPoint);

            AudioData record = new AudioData();
            record.MaxLevel = Mathf.Max(Mathf.Abs(Mathf.Min(samples)), Mathf.Max(samples));
            record.Clip = AudioClip.Create("Recording", midPoint, recording.channels, recordingHZ, false);
            record.Clip.SetData(samples, 0);

            speechToText.OnListen(record);

            bFirstBlock = !bFirstBlock;
        }
        else
        {
            // calculate the number of samples remaining until we're ready for a block
            // of audio, and wait the amount of time it will take to record them.
            int remaining = bFirstBlock ? (midPoint - writePos) : (recording.samples - writePos);
            float timeRemaining = (float)remaining / (float)recordingHZ;

            yield return new WaitForSeconds(timeRemaining);
        }
    }

    yield break;
}

private void OnSpeechRecognize(SpeechRecognitionEvent result, Dictionary<string, object> customData)
{
    if (result != null && result.results.Length > 0)
    {
        foreach (var res in result.results)
        {
            foreach (var alt in res.alternatives)
            {

                string text = string.Format("{0} ({1}, {2:0.00})\n", alt.transcript, res.final ? "Final" : "Interim", alt.confidence);

                if (speechToTextField != null)
                {
                    speechToTextField.text = text;
                }

                if (res.final)
                {
                    if (characterState == SocialState.listening)
                    {
                        Debug.Log("WATSON | Speech to text recorded: \n" + alt.transcript);
                        StartCoroutine(Message(alt.transcript));
                    }
                }
                else
                {
                    if (characterState == SocialState.idle)
                    {
                        characterState = SocialState.listening;
                    }
                }
            }
        }
    }
}


// text to speech
private IEnumerator Synthesize(string text)
{
    Debug.Log("WATSON CALL | Synthesize input: \n" + text);

    textToSpeech.Voice = voiceType;
    bool doSynthesize = textToSpeech.ToSpeech(HandleSynthesizeCallback, OnFail, text, true);

    if (doSynthesize)
    {
        StartCoroutine(Analyze(text));
        saying = text;
        characterState = SocialState.talking;
    }
    yield return null;
}

void HandleSynthesizeCallback(AudioClip clip, Dictionary<string, object> customData = null)
{
    if (Application.isPlaying && clip != null)
    {
        voiceSource.clip = clip;
        voiceSource.Play();
    }
}

// conversation
private IEnumerator Message(string text)
{
    Debug.Log("WATSON | Conversation input: \n" + text);

    MessageRequest messageRequest = new MessageRequest()
    {
        input = new Dictionary<string, object>()
        {
            { "text", text }
        },
        context = conversationContext
    };
    bool doMessage = conversation.Message(HandleMessageCallback, OnFail, workspaceId, messageRequest);

    if (doMessage)
    {
        characterState = SocialState.thinking;

        if (conversationInputField != null)
        {
            conversationInputField.text = text;
        }
    }

    yield return null;
}

void HandleMessageCallback(object resp, Dictionary<string, object> customData)
{
    object _tempContext = null;
    (resp as Dictionary<string, object>).TryGetValue("context", out _tempContext);

    if (_tempContext != null)
        conversationContext = _tempContext as Dictionary<string, object>;

    Dictionary<string, object> dict = Json.Deserialize(customData["json"].ToString()) as Dictionary<string, object>;
    Dictionary<string, object> output = dict["output"] as Dictionary<string, object>;
    Debug.Log("JSON INFO: " + customData["json"].ToString());

    // Send new/update context variables to the Watson Conversation Service
    if (weather.temperatureCity != null && !conversationContext.ContainsKey("temperature"))
    {
        string currentTemperature = weather.temperatureNumber.ToString();
        conversationContext.Add("temperature", currentTemperature);
    }
    else if (conversationContext.ContainsKey("temperature"))
    {
        string currentTemperature = weather.temperatureNumber.ToString();
        conversationContext.Remove("temperature");
        conversationContext.Add("temperature", currentTemperature);
        //Debug.Log("Current Temperature: " + currentTemperature);
    }

    // $ call context variables
    var context = dict["context"] as Dictionary<string, object>;
    if (context.ContainsKey("destination_city") && context["destination_city"] != null)
    {
        destinationCity = context["destination_city"].ToString();
        Debug.Log("Destination city: " + destinationCity);
    }
    if (context.ContainsKey("departure_city") && context["departure_city"] != null)
    {
        departureCity = context["departure_city"].ToString();
    }

    List<object> text = output["text"] as List<object>;
    string answer = text[0].ToString(); // only return the first response

    Debug.Log("WATSON | Conversation output: \n" + answer);

    if (conversationOutputField != null)
    {
        conversationOutputField.text = answer;
    }

    fsData fsdata = null;
    fsResult r = _serializer.TrySerialize(resp.GetType(), resp, out fsdata);
    if (!r.Succeeded)
    {
        throw new WatsonException(r.FormattedMessages);
    }

    //convert fsdata to MessageResponse
    MessageResponse messageResponse = new MessageResponse();
    object obj = messageResponse;
    r = _serializer.TryDeserialize(fsdata, obj.GetType(), ref obj);
    if (!r.Succeeded)
    {
        throw new WatsonException(r.FormattedMessages);
    }

    if (resp != null)
    {
        //Recognize intents & entities
        if (messageResponse.intents.Length > 0 && messageResponse.entities.Length > 0)
        {
            string intent = messageResponse.intents[0].intent;
            string entity = messageResponse.entities[0].entity;
            string literalEntity = messageResponse.entities[0].value;
            if (entity == "city")
            {
                literalEntityCity = literalEntity;
            }
            if (intent == "weather" && entity == "city")
            {
                literalEntityCity = literalEntity;
            }
        }
        if (messageResponse.intents.Length > 0)
        {
            string intent = messageResponse.intents[0].intent;
            //Debug.Log("Intent: " + intent);                           //intent name
        }
        if (messageResponse.entities.Length > 0)
        {
            string entity = messageResponse.entities[0].entity;
            //Debug.Log("Entity: " + entity);                             //entity name
            string literalEntity = messageResponse.entities[0].value;
            //Debug.Log("Entity Literal: " + literalEntity);                //literal spoken entity
            if (entity == "city")
            {
                literalEntityCity = literalEntity;
            }
        }
    }

    StartCoroutine(Synthesize(answer));
}
}

Best Answer

What you are asking is rather complex. I believe that if you want to train a model, it should be done with Watson's own tools rather than anything Unity-specific.
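
For reference, the customization flow that the cURL tutorial linked in the question walks through (create a custom language model, add words, train it) can also be driven from C# by calling those documented REST endpoints directly. The following is only a rough sketch under that assumption: the credentials are placeholders, the response parsing is omitted, and the customization ID is hard-coded where you would normally read it from the create response.

using System.Collections;
using System.Text;
using UnityEngine;
using UnityEngine.Networking;

public class CustomModelCreator : MonoBehaviour
{
    // Placeholders: fill in your own service credentials.
    public string username, password;
    public string url = "https://stream.watsonplatform.net/speech-to-text/api";

    IEnumerator Start()
    {
        // 1. Create an empty custom language model on top of a base model.
        string createBody = "{\"name\": \"CitiesModel\", \"base_model_name\": \"en-US_BroadbandModel\"}";
        yield return Post(url + "/v1/customizations", createBody);

        // 2. Parse "customization_id" out of the previous response; it is
        //    hard-coded here only to keep the sketch short.
        string customizationId = "<customization_id from step 1>";

        // 3. Add the problematic city names as custom words.
        string wordsBody = "{\"words\": [{\"word\": \"Berlin\"}, {\"word\": \"Paris\"}, " +
                           "{\"word\": \"London\"}, {\"word\": \"Jakarta\"}]}";
        yield return Post(url + "/v1/customizations/" + customizationId + "/words", wordsBody);

        // 4. Start training. Poll GET /v1/customizations/{id} until its
        //    status is "available" before using the model for recognition.
        yield return Post(url + "/v1/customizations/" + customizationId + "/train", "{}");
    }

    IEnumerator Post(string endpoint, string jsonBody)
    {
        UnityWebRequest request = new UnityWebRequest(endpoint, "POST");
        request.uploadHandler = new UploadHandlerRaw(Encoding.UTF8.GetBytes(jsonBody));
        request.downloadHandler = new DownloadHandlerBuffer();
        request.SetRequestHeader("Content-Type", "application/json");
        string auth = System.Convert.ToBase64String(Encoding.UTF8.GetBytes(username + ":" + password));
        request.SetRequestHeader("Authorization", "Basic " + auth);
        yield return request.SendWebRequest();
        Debug.Log(endpoint + " -> " + request.responseCode + "\n" + request.downloadHandler.text);
    }
}

Once training finishes and the model's status is "available", its ID is what would go into the commented-out speechToText.CustomizationId line in the Active property of the script above.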

What you can do in Unity itself, though, is correct the returned word. That is, if you only expect city names back, you can download a list of all cities with, say, more than 100,000 inhabitants (such lists are freely available online) and check whether the returned word appears in it. For example:

http://download.geonames.org/export/dump/

If it does not, you can assume Watson misrecognized it, and you can correct the returned word with something like the edit distance. Check this.

Basically, that algorithm measures how different two words are; other algorithms can be used to find the word in the list that is most similar to the given one. You can get some ideas from here or from this other one.
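
To make that concrete, here is a minimal sketch of the correction step: the standard dynamic-programming Levenshtein distance plus a lookup that snaps a transcript to the closest known city. CityMatcher, CorrectCity and the distance threshold are invented for this example and would need tuning against your real city list.

using System;

public static class CityMatcher
{
    // Classic dynamic-programming Levenshtein (edit) distance.
    public static int Levenshtein(string a, string b)
    {
        int[,] d = new int[a.Length + 1, b.Length + 1];
        for (int i = 0; i <= a.Length; i++) d[i, 0] = i;
        for (int j = 0; j <= b.Length; j++) d[0, j] = j;

        for (int i = 1; i <= a.Length; i++)
        {
            for (int j = 1; j <= b.Length; j++)
            {
                int cost = a[i - 1] == b[j - 1] ? 0 : 1;
                d[i, j] = Math.Min(Math.Min(
                    d[i - 1, j] + 1,         // deletion
                    d[i, j - 1] + 1),        // insertion
                    d[i - 1, j - 1] + cost); // substitution
            }
        }
        return d[a.Length, b.Length];
    }

    // Returns the closest city from the list, or null when even the best
    // candidate is further away than maxDistance (to avoid wild guesses).
    public static string CorrectCity(string heard, string[] cities, int maxDistance = 3)
    {
        heard = heard.Trim().ToLowerInvariant();
        string best = null;
        int bestDistance = int.MaxValue;
        foreach (string city in cities)
        {
            int distance = Levenshtein(heard, city.ToLowerInvariant());
            if (distance < bestDistance)
            {
                bestDistance = distance;
                best = city;
            }
        }
        return bestDistance <= maxDistance ? best : null;
    }
}

With a list containing Berlin, Paris, London and Jakarta, CityMatcher.CorrectCity("burning", cities) returns "Berlin" (edit distance 3); loading the list could be as simple as System.IO.File.ReadAllLines("cities.txt") over a file extracted from the geonames dump above.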

A similar question about c# - How to make a custom model for speech-to-text with the Watson Unity SDK? can be found on Stack Overflow: https://stackoverflow.com/questions/50367052/
