我正在开发一个使用 SFSpeechRecognizer
的 iOS 项目,它在开始时运行良好。我说了一些话,它就回应了。但是一两分钟后,它就失败了。它不再提供任何识别结果的反馈。
我想知道这是否与缓冲区有关,但我不知道如何解决。
我基本上是用SpeechRecognizer的demo来搭建工程的。不同的是我把识别出来的结果一个字一个字的存储在一个数组中。程序会分析数组并响应某些单词,例如“播放”或先前设置的其他一些命令。程序响应命令后,删除该数组元素。
话不多说,代码如下:
识别器,可以看到
supportedCommands
数组过滤了一些特定的词让程序响应。其他部分与https://developer.apple.com/library/content/samplecode/SpeakToMe/Listings/SpeakToMe_ViewController_swift.html#//apple_ref/doc/uid/TP40017110-SpeakToMe_ViewController_swift-DontLinkElementID_6的demo类似。

/// Long-lived speech listener that extracts supported command words
/// ("more", "play") from the live transcription and queues them for
/// the UI layer to consume.
class SpeechRecognizer: NSObject, SFSpeechRecognizerDelegate {

    private var speechRecognizer: SFSpeechRecognizer!
    private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest!
    private var recognitionTask: SFSpeechRecognitionTask!
    private let audioEngine = AVAudioEngine()
    private let locale = Locale(identifier: "en-US")

    // Last full transcription delivered by the recognizer; used to diff
    // out only the newly-appended words on each partial-result callback.
    private var lastSavedString: String = ""
    private let supportedCommands = ["more", "play"]

    // FIFO of recognized command words; the caller reads index 0 and
    // removes it after acting on the command.
    var speechInputQueue: [String] = [String]()

    // BUG FIX: SFSpeechRecognizer enforces a strict per-task audio limit
    // of about one minute (WWDC 2016 Session 509). The original code tore
    // the session down when the task ended and never restarted it, so
    // recognition silently died after ~1 minute. This flag tells the
    // result handler to spin up a fresh task whenever the current one
    // finishes, unless stop() was called deliberately.
    private var shouldRestart = false

    /// Prepares the recognizer and requests user authorization.
    func load() {
        print("load")
        prepareRecognizer(locale: locale)
        authorize()
    }

    /// Begins (or resumes) continuous listening.
    func start() {
        print("start")
        shouldRestart = true
        if !audioEngine.isRunning {
            // BUG FIX: was `try!` — a session-configuration failure
            // (e.g. mic unavailable) crashed the app.
            do {
                try startRecording()
            } catch {
                print("startRecording failed:", error)
            }
        }
    }

    /// Stops listening; suppresses the automatic restart.
    func stop() {
        shouldRestart = false
        if audioEngine.isRunning {
            audioEngine.stop()
            recognitionRequest?.endAudio()
        }
    }

    /// Asks the user for speech-recognition permission (result is only logged).
    private func authorize() {
        SFSpeechRecognizer.requestAuthorization { authStatus in
            OperationQueue.main.addOperation {
                switch authStatus {
                case .authorized:
                    print("Authorized!")
                case .denied:
                    print("Unauthorized!")
                case .restricted:
                    print("Unauthorized!")
                case .notDetermined:
                    print("Unauthorized!")
                }
            }
        }
    }

    private func prepareRecognizer(locale: Locale) {
        speechRecognizer = SFSpeechRecognizer(locale: locale)!
        speechRecognizer.delegate = self
    }

    /// Configures the audio session, installs the mic tap and starts a
    /// recognition task. Called again automatically each time a task ends.
    /// - Throws: audio-session / audio-engine configuration errors.
    private func startRecording() throws {
        // Cancel the previous task if it's running.
        if let recognitionTask = recognitionTask {
            recognitionTask.cancel()
            self.recognitionTask = nil
        }

        // BUG FIX: each new task starts a brand-new (empty) transcription,
        // so the diff baseline must be cleared; otherwise the first words
        // of the new session would be swallowed by the stale prefix check.
        lastSavedString = ""

        let audioSession = AVAudioSession.sharedInstance()
        try audioSession.setCategory(AVAudioSessionCategoryPlayAndRecord, with: .defaultToSpeaker)
        try audioSession.setMode(AVAudioSessionModeDefault)
        try audioSession.setActive(true, with: .notifyOthersOnDeactivation)

        recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
        let inputNode = audioEngine.inputNode
        guard let recognitionRequest = recognitionRequest else {
            fatalError("Unable to created a SFSpeechAudioBufferRecognitionRequest object")
        }

        // Configure request so that results are returned before audio recording is finished
        recognitionRequest.shouldReportPartialResults = true

        // A recognition task represents a speech recognition session.
        // We keep a reference to the task so that it can be cancelled.
        // NOTE(review): the handler captures `self` strongly; acceptable
        // while the recognizer outlives the session, but consider
        // [weak self] if SpeechRecognizer can be deallocated mid-task.
        recognitionTask = speechRecognizer.recognitionTask(with: recognitionRequest) { result, error in
            var isFinal = false
            if let result = result {
                let temp = result.bestTranscription.formattedString
                    .trimmingCharacters(in: CharacterSet.whitespacesAndNewlines)
                    .lowercased()
                //print("temp", temp)
                if temp != self.lastSavedString && temp.count > self.lastSavedString.count {
                    var tempSplit = temp.split(separator: " ")
                    var lastSplit = self.lastSavedString.split(separator: " ")
                    // Drop the common prefix that was already processed on
                    // the previous callback, leaving only the new words.
                    // BUG FIX: also require tempSplit to be non-empty —
                    // the recognizer may revise earlier words, in which
                    // case tempSplit can run out first and the original
                    // `tempSplit[0]` crashed with index-out-of-range.
                    while !lastSplit.isEmpty && !tempSplit.isEmpty {
                        if String(tempSplit[0]) == String(lastSplit[0]) {
                            tempSplit.remove(at: 0)
                            lastSplit.remove(at: 0)
                        } else {
                            break
                        }
                    }
                    // Queue any newly-heard supported command words.
                    for command in tempSplit {
                        if self.supportedCommands.contains(String(command)) {
                            self.speechInputQueue.append(String(command))
                        }
                    }
                    self.lastSavedString = temp
                }
                isFinal = result.isFinal
            }
            if error != nil || isFinal {
                self.audioEngine.stop()
                inputNode.removeTap(onBus: 0)
                self.recognitionRequest = nil
                self.recognitionTask = nil

                // The task ended: either stop() was called, an error
                // occurred, or the ~1-minute per-task limit was hit.
                // BUG FIX: restart a fresh task so listening continues
                // indefinitely instead of silently going quiet.
                if self.shouldRestart {
                    do {
                        try self.startRecording()
                    } catch {
                        print("restart after session end failed:", error)
                    }
                }
            }
        }

        let recordingFormat = inputNode.outputFormat(forBus: 0)
        inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) { (buffer: AVAudioPCMBuffer, when: AVAudioTime) in
            self.recognitionRequest?.append(buffer)
        }

        audioEngine.prepare()
        try audioEngine.start()
    }
}
我们如何使用它:
// Consume the recognizer's command queue: act on the oldest queued
// command word, then remove it so each command is handled exactly once.
// NOTE(review): this appears to run inside a view controller whose
// `temp`, `content` and `audioPlayer` are declared elsewhere — not
// visible here, so their semantics are assumed from usage.
if self.speechRecognizer.speechInputQueue.count > 0 {
    if self.speechRecognizer.speechInputQueue[0] == "more" {
        print("temp", temp)
        print("content", content)
        // isSpeakingContent = true
        // "more": speak the current content aloud.
        self.textToSpeech(text: content)
    } else if self.speechRecognizer.speechInputQueue[0] == "play" {
        print("try to play")
        // "play": play the bundled cascade.wav sound.
        // NOTE(review): the force-unwrap crashes if cascade.wav is not in
        // the app bundle — confirm the resource is shipped.
        let soundURL = URL(fileURLWithPath: Bundle.main.path(forResource: "cascade", ofType: "wav")!)
        do {
            audioPlayer = try AVAudioPlayer(contentsOf: soundURL)
        } catch {
            print(error)
        }
        audioPlayer.prepareToPlay()
        audioPlayer.play()
    } else {
        // Any other queued word is unsupported.
        self.textToSpeech(text: "unrecognized command")
    }
    // Remove the command we just handled from the front of the queue.
    self.speechRecognizer.speechInputQueue.remove(at: 0)
    print("after :", self.speechRecognizer.speechInputQueue)
}
它响应某些命令并播放一些音频。
Buffer有问题吗?也许识别一两分钟后,缓冲区就满了?识别器只是随着时间的推移而失败。
最佳答案
来自 WWDC 2016 Session 509: Speech Recognition API :
For iOS 10 we're starting with a strict audio duration limit of about one minute which is similar to that of keyboard dictation.
也就是说,单个识别任务最长只能持续约一分钟,这正是识别器"运行一两分钟后失败"的原因,与缓冲区无关。要实现连续识别,需要在当前任务结束(结果的 isFinal 为 true 或回调收到 error)时,重新创建 SFSpeechAudioBufferRecognitionRequest 并重新开始录音,开启一个新的识别任务。
关于ios - 几分钟后 SpeechRecognizer 失败,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/49878238/