Login | Register 
FEATURES
APPLICATIONS
DOWNLOADS
EDUCATION
BLOG
WIKI
FORUM
STORE

Google Cloud Speech in TouchDesigner

Still a little green? Ask your beginner's questions here

Google Cloud Speech in TouchDesigner

Postby Niels_Lutken » Thu Jun 06, 2019 11:07 am

Hey, I'm a fairly new user to TouchDesigner, and have run into a problem with my project:

I'm currently working on a project where I need TouchDesigner to react to certain predefined spoken sentiments, and I'm therefore trying to implement a Google Cloud Speech script in TouchDesigner.

The code works outside of TD, with microphone handling done by PyAudio.

I presume I have to do the microphone handling through an AudioDeviceIn CHOP when putting it together in TouchDesigner. Currently, when running the script, I get no output when speaking into the mic, yet I also don't receive any error messages. I assume that the script somehow isn't receiving the audio input.

For now I just want the output in the textport.

Can someone help me out? Here's my Python code:

Code: Select all
from __future__ import division

import re
import sys

from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
from six.moves import queue

import os
credential_path = "/Path/to/googlecredentials.json"
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path

# Audio recording parameters
RATE = 16000
CHUNK = int(RATE / 10)  # 100ms

# NOTE(review): this returns a TouchDesigner Channel object -- effectively an
# array of float samples -- not a PyAudio-style device.  It appears to have no
# open()/terminate() methods, so MicrophoneStream.__enter__ below will fail;
# confirm against the Channel Class docs.
touchmic = op('audiodevin1')['chan1']


class MicrophoneStream(object):
    """Buffer audio from a stream callback and yield it in byte chunks.

    NOTE(review): as written this version cannot work -- ``touchmic`` is a
    TouchDesigner Channel (an array of samples), not a device with an
    ``open()`` method, and ``mono`` / ``Continue`` below are undefined
    names.  A corrected PyAudio-based version appears later in the thread.
    """
    def __init__(self, rate, chunk):
        self._rate = rate    # sample rate in Hz
        self._chunk = chunk  # frames per buffer
        # Thread-safe hand-off between the stream callback and generator().
        self._buff = queue.Queue()
        self.closed = True

    def __enter__(self):
        self._audio_interface = touchmic
        # NOTE(review): ``format=mono`` is an undefined name (NameError);
        # PyAudio expects a sample-format constant such as pyaudio.paInt16.
        self._audio_stream = self._audio_interface.open(
            format=mono,
            channels=1, rate=self._rate,
            input=True, frames_per_buffer=self._chunk,
            stream_callback=self._fill_buffer,
        )

        self.closed = False
        return self

    def __exit__(self, type, value, traceback):
        self._audio_stream.stop_stream()
        self._audio_stream.close()
        self.closed = True
        # Sentinel unblocks generator() so it can exit cleanly.
        self._buff.put(None)
        self._audio_interface.terminate()

    def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
        # Stream callback: stash the raw audio for generator() to consume.
        self._buff.put(in_data)
        # NOTE(review): ``Continue`` is undefined; PyAudio's constant is
        # pyaudio.paContinue.
        return None, Continue

    def generator(self):
        # Yield concatenated buffered chunks until the stream is closed.
        while not self.closed:
            chunk = self._buff.get()  # block until at least one chunk arrives
            if chunk is None:
                return
            data = [chunk]

            # Drain anything else already buffered, without blocking.
            while True:
                try:
                    chunk = self._buff.get(block=False)
                    if chunk is None:
                        return
                    data.append(chunk)
                except queue.Empty:
                    break

            yield b''.join(data)


def listen_print_loop(responses):
    """Print streaming recognition results to stdout.

    Interim hypotheses are rewritten in place using a carriage return;
    each final transcript is printed on its own line.  The loop ends
    when a final transcript contains "exit" or "quit".
    """
    printed = 0  # length of the interim line currently on screen
    for response in responses:
        if not response.results:
            continue
        result = response.results[0]
        if not result.alternatives:
            continue

        transcript = result.alternatives[0].transcript
        # Trailing spaces erase leftovers from a longer interim line.
        overwrite_chars = ' ' * (printed - len(transcript))

        if result.is_final:
            print(transcript + overwrite_chars)
            if re.search(r'\b(exit|quit)\b', transcript, re.I):
                print('Exiting..')
                break
            printed = 0
        else:
            sys.stdout.write(transcript + overwrite_chars + '\r')
            sys.stdout.flush()
            printed = len(transcript)


def main():
    """Stream microphone audio to Google Cloud Speech and print transcripts."""

    language_code = 'en-US'  # a BCP-47 language tag

    client = speech.SpeechClient()
    # LINEAR16 matches the raw 16 kHz PCM this script intends to capture.
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(
        config=config,
        interim_results=True)

    # NOTE(review): MicrophoneStream.__enter__ above references undefined
    # names and a Channel with no open(), so this block appears to fail
    # before any audio is streamed -- see the reply downthread.
    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)

        responses = client.streaming_recognize(streaming_config, requests)
        listen_print_loop(responses)


if __name__ == '__main__':
    main()
Niels_Lutken
 
Posts: 5
Joined: Wed Jun 05, 2019 8:18 am

Re: Google Cloud Speech in TouchDesigner

Postby matthewwachter » Thu Jun 06, 2019 12:49 pm

Hi Niels,

This line...
Code: Select all
touchmic = op('audiodevin1')['chan1']

...would actually return a Channel Class Object which isn't the same as an audio device or microphone input. This object could be thought of as an array of floats (one for each sample if we're dealing with an audio channel).

https://docs.derivative.ca/Channel_Class


You could probably get this to work in TouchDesigner by re-writing the MicrophoneStream class to create your 'chunks' by buffering Channel Class values but personally I would just run this script externally and send TouchDesigner messages via TCP.

Here's an example of a python script that sends a message to TD (or any server really):
Code: Select all
    # Minimal TCP client: JSON-encode a dict and send it to a listening
    # server (e.g. a TCP DAT in TouchDesigner bound to localhost:8008).
    import json
    import socket

    # Create a TCP/IP socket
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

    # Connect the socket to the port where the server is listening
    server_address = ('localhost', 8008)
    sock.connect(server_address)

    # Create the data and load it into json
    data = {
        'cmd': 'test',
        'data': ['foo', 'bar'],
    }
    msg = json.dumps(data)

    # Send the message
    sock.sendall(msg.encode('utf-8'))


So I think you could add this to your code like so:

Code: Select all
from __future__ import division

import json
import re
import socket
import sys

from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
from six.moves import queue

import os
# BUG FIX: ``pyaudio`` is used below but was never imported in this
# listing, so the script stops with NameError at ``pyaudio.PyAudio()``.
import pyaudio

credential_path = "/Path/to/googlecredentials.json"
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path

# Audio recording parameters
RATE = 16000
CHUNK = int(RATE / 10)  # 100ms

# Shared PyAudio host handle used by MicrophoneStream below.
mic = pyaudio.PyAudio()


class MicrophoneStream(object):
    """Open a PyAudio input stream and yield buffered audio chunks.

    Use as a context manager; ``generator()`` yields ``bytes`` blocks
    suitable for streaming to the Speech API.
    """

    def __init__(self, rate, chunk):
        self._rate = rate    # sample rate in Hz
        self._chunk = chunk  # frames per buffer
        # Thread-safe hand-off between the PyAudio callback and generator().
        self._buff = queue.Queue()
        self.closed = True

    def __enter__(self):
        self._audio_interface = mic
        # BUG FIX: the original passed ``format=mono`` -- an undefined name.
        # PyAudio expects a sample-format constant; 16-bit PCM matches the
        # LINEAR16 encoding configured in main().
        self._audio_stream = self._audio_interface.open(
            format=pyaudio.paInt16,
            channels=1, rate=self._rate,
            input=True, frames_per_buffer=self._chunk,
            stream_callback=self._fill_buffer,
        )

        self.closed = False
        return self

    def __exit__(self, type, value, traceback):
        self._audio_stream.stop_stream()
        self._audio_stream.close()
        self.closed = True
        # Sentinel unblocks generator() so it can exit cleanly.
        self._buff.put(None)
        self._audio_interface.terminate()

    def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
        """PyAudio callback: stash raw audio and keep the stream running."""
        self._buff.put(in_data)
        # BUG FIX: the original returned ``Continue`` -- an undefined name.
        # PyAudio's keep-recording constant is ``pyaudio.paContinue``.
        return None, pyaudio.paContinue

    def generator(self):
        """Yield concatenated buffered chunks until the stream is closed."""
        while not self.closed:
            # Block until at least one chunk arrives; None means shutdown.
            chunk = self._buff.get()
            if chunk is None:
                return
            data = [chunk]

            # Drain anything else already buffered, without blocking.
            while True:
                try:
                    chunk = self._buff.get(block=False)
                    if chunk is None:
                        return
                    data.append(chunk)
                except queue.Empty:
                    break

            yield b''.join(data)


def listen_print_loop(responses, sock):
    """Print streaming transcripts; send each final one to *sock* as JSON.

    Interim results are rewritten in place on stdout via '\r'; final
    transcripts are printed and forwarded over the TCP socket as a
    JSON object of the form {'data': <transcript>}.  The loop ends when
    a final transcript contains "exit" or "quit".
    """
    num_chars_printed = 0
    for response in responses:
        if not response.results:
            continue
        result = response.results[0]
        if not result.alternatives:
            continue

        transcript = result.alternatives[0].transcript
        # Trailing spaces erase leftovers from a longer interim line.
        overwrite_chars = ' ' * (num_chars_printed - len(transcript))

        if not result.is_final:
            sys.stdout.write(transcript + overwrite_chars + '\r')
            sys.stdout.flush()

            num_chars_printed = len(transcript)

        else:
            print(transcript + overwrite_chars)

            # send as a tcp message
            data = {
                'data': transcript + overwrite_chars
            }
            msg = json.dumps(data)
            # BUG FIX: the original called .encode('uft-8') -- a misspelled
            # codec name that raises LookupError on the first final result.
            sock.sendall(msg.encode('utf-8'))

            if re.search(r'\b(exit|quit)\b', transcript, re.I):
                print('Exiting..')
                break

            num_chars_printed = 0


def main():
    """Stream mic audio to Cloud Speech and relay final transcripts via TCP."""
    # Client socket the transcripts are forwarded to (e.g. a TCP DAT
    # server inside TouchDesigner listening on this port).
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.connect(('localhost', 8008))

    client = speech.SpeechClient()
    recognition_config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code='en-US')  # a BCP-47 language tag
    streaming_config = types.StreamingRecognitionConfig(
        config=recognition_config,
        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        request_stream = (
            types.StreamingRecognizeRequest(audio_content=content)
            for content in stream.generator())
        responses = client.streaming_recognize(streaming_config, request_stream)
        listen_print_loop(responses, sock)


if __name__ == '__main__':
    main()


I haven't tested this out but I think it would work. Not really sure how you had the microphone input set up before. You'll just need to make a TCP DAT set to be a server for the script to connect to. Also, the row/callback format found in the DAT parameters should be set to "One For All Received Data."

Personally I would go this route so you don't end up with any issues where TD is left waiting for your script to process. There are a few while loops in there that might cause TouchDesigner to hang if you were to run the script in a scene. You might be able to use the Python threading module to deal with that, but I think it would be a lot less trouble to just run this one externally.

If you'd like to read more about python TCP clients/servers you can check out this repo:
https://bitbucket.org/matthewwachter/tcp_threadedserver/src/master/
matthewwachter
 
Posts: 295
Joined: Fri Jul 08, 2011 3:18 pm
Location: Los Angeles

Re: Google Cloud Speech in TouchDesigner

Postby Niels_Lutken » Fri Jun 07, 2019 9:14 am

Thank you so much!
This fixed it for me - now on to the next step of my project.
I had made some errors in my initial code that I also fixed. I'll paste it here, with your code included, for future reference:

Code: Select all
from __future__ import division

import json
import re
import socket
import sys

from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
from six.moves import queue

import pyaudio

import os

credential_path = "/path/to/credentials.json"
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path

# Audio recording parameters
RATE = 16000
CHUNK = int(RATE / 10)  # 100ms




class MicrophoneStream(object):
    """Context-managed microphone capture that yields raw audio chunks.

    While open, a PyAudio callback pushes ``bytes`` blocks into an
    internal queue; ``generator()`` drains that queue and yields the
    concatenated audio until the stream is closed.
    """

    def __init__(self, rate, chunk):
        # Capture parameters for the PyAudio stream opened in __enter__.
        self._rate = rate
        self._chunk = chunk
        # Thread-safe hand-off between the audio callback and generator().
        self._buff = queue.Queue()
        self.closed = True

    def __enter__(self):
        self._audio_interface = pyaudio.PyAudio()
        self._audio_stream = self._audio_interface.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=self._rate,
            input=True,
            frames_per_buffer=self._chunk,
            stream_callback=self._fill_buffer,
        )
        self.closed = False
        return self

    def __exit__(self, type, value, traceback):
        self._audio_stream.stop_stream()
        self._audio_stream.close()
        self.closed = True
        # Sentinel unblocks generator() so it can terminate cleanly.
        self._buff.put(None)
        self._audio_interface.terminate()

    def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
        """PyAudio stream callback: buffer the audio, keep recording."""
        self._buff.put(in_data)
        return None, pyaudio.paContinue

    def generator(self):
        """Yield joined audio chunks until the stream reports closed."""
        while not self.closed:
            first = self._buff.get()  # block until at least one chunk
            if first is None:
                return
            pieces = [first]
            # Greedily drain anything else already queued.
            while True:
                try:
                    extra = self._buff.get(block=False)
                except queue.Empty:
                    break
                if extra is None:
                    return
                pieces.append(extra)
            yield b''.join(pieces)


def listen_print_loop(responses, sock):
    """Print streaming transcripts and forward final ones over *sock*.

    Interim results are rewritten in place on stdout (using '\r'); each
    final transcript is printed, JSON-encoded as {'data': ...} and sent
    newline-terminated over the TCP socket.  A final transcript that
    contains "exit" or "quit" ends the loop.
    """
    printed = 0  # length of the interim line currently on screen
    for response in responses:
        if not response.results:
            continue
        result = response.results[0]
        if not result.alternatives:
            continue

        transcript = result.alternatives[0].transcript
        # Trailing spaces erase leftovers from a longer interim line.
        overwrite_chars = ' ' * (printed - len(transcript))

        if result.is_final:
            print(transcript + overwrite_chars)

            # send as a tcp message
            payload = json.dumps({'data': transcript + overwrite_chars}) + "\n"
            sock.sendall(payload.encode('utf-8'))

            if re.search(r'\b(exit|quit)\b', transcript, re.I):
                print('Exiting..')
                break
            printed = 0
        else:
            sys.stdout.write(transcript + overwrite_chars + '\r')
            sys.stdout.flush()
            printed = len(transcript)


def main():
    """Capture mic audio, stream it to Cloud Speech, forward finals via TCP."""
    # Client socket for pushing transcripts into the TouchDesigner TCP DAT.
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.connect(('localhost', 8008))

    speech_client = speech.SpeechClient()
    streaming_config = types.StreamingRecognitionConfig(
        config=types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=RATE,
            language_code='en-US'),  # a BCP-47 language tag
        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as mic_stream:
        requests = (types.StreamingRecognizeRequest(audio_content=chunk)
                    for chunk in mic_stream.generator())
        responses = speech_client.streaming_recognize(streaming_config, requests)
        listen_print_loop(responses, sock)


if __name__ == '__main__':
    main()
Niels_Lutken
 
Posts: 5
Joined: Wed Jun 05, 2019 8:18 am


Return to Beginners

Who is online

Users browsing this forum: Bing [Bot], Majestic-12 [Bot], TimFranklin and 10 guests