Speech To Text in Realtime
September 17, 2022
So you want to transcribe audio from your microphone in real time, using a readily available API with a good response time?
Setting up the Server
mkdir server && cd server
npm init -y
npm i -D typescript @tsconfig/node16 concurrently nodemon @types/pumpify @types/node @types/express
npx tsc --init
npm i socket.io express @google-cloud/speech chalk@4 long # chalk@4 is the last version with CommonJS support
Next, we open the tsconfig.json file generated by the above command. Make sure it looks something like this:
{
  "extends": "@tsconfig/node16/tsconfig.json",
  "compilerOptions": {
    "target": "es5",
    "module": "commonjs",
    "outDir": "dist",
    "strict": true,
    "esModuleInterop": true,
    "skipLibCheck": true,
    "forceConsistentCasingInFileNames": true
  },
  "include": ["src"],
  "exclude": ["node_modules"]
}
Let’s set up TypeScript to watch for code changes and transpile them to JavaScript on the fly, using concurrently. Add the following script to your package.json:
1"scripts": {
2 "dev": "concurrently \"npx tsc --watch\" \"nodemon -q ./dist/index.js\"
3}
Now, to start the server, simply run npm run dev.
Coding the server.ts file
This is a basic Express server boilerplate you can use to get up and running.
import express, { Express } from "express";
import { createServer } from "https";
import fs from "fs";
import { Server } from "socket.io";

const app: Express = express();

// key.pem and cert.pem are a self-signed certificate pair used to serve over HTTPS locally.
const server = createServer(
  {
    key: fs.readFileSync("key.pem"),
    cert: fs.readFileSync("cert.pem"),
  },
  app
);
const port = 3000;
const io = new Server(server, {
  cors: {
    origin: "*",
  },
});

server.listen(port, () => {
  console.log(`⚡️[server]: Server is running at https://localhost:${port}`);
});
Next, we set up socket.io and add some event listeners.
import { google } from "@google-cloud/speech/build/protos/protos";
import { Socket } from "socket.io";
import speechToTextUtils from "./speechToTextUtils";

io.on("connection", (socket: Socket) => {
  console.log("Socket Connection: ", socket.connected);
  console.log("Socket Id: ", socket.id);
  speechToTextUtils._socket = socket;

  // Define socket event listeners.
  socket.on(
    "startGoogleCloudStream",
    (request: google.cloud.speech.v1.IStreamingRecognitionConfig) => {
      // Very long type, but alas that's what Google decided to use.
      speechToTextUtils._request = request;
      console.log("Starting Google Cloud Transcription");
      speechToTextUtils.startRecognitionStream();
    }
  );

  // Receive audio data from front-end
  socket.on("binaryAudioData", (data: DataView) => {
    speechToTextUtils.receiveData(data);
  });

  // End the audio stream
  socket.on("endGoogleCloudStream", () => {
    speechToTextUtils.stopRecognitionStream();
  });
});
Let’s go over what these event listeners do:

- startGoogleCloudStream: As the name suggests, this initializes the speechToTextUtils class and sets up the necessary options, which we will look at soon (a sketch of the expected request payload follows this list).
- binaryAudioData: This receives the audio data from the front end as binary int16 buffers, which is processed by the receiveData function in speechToTextUtils.
- endGoogleCloudStream: This calls a helper function that turns off all event listeners and shuts down the transcription.
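For reference, here is a sketch of the kind of request config startGoogleCloudStream expects. It mirrors the payload the Angular client emits later in this article, so treat the exact field values as an example rather than a requirement.

import { google } from "@google-cloud/speech/build/protos/protos";

// Example payload only; it matches what the client sends later in this article.
const request: google.cloud.speech.v1.IStreamingRecognitionConfig = {
  config: {
    encoding: "LINEAR16", // raw 16-bit PCM audio
    sampleRateHertz: 16000,
    languageCode: "en-US",
    profanityFilter: false,
    enableWordTimeOffsets: true,
  },
  interimResults: true, // stream partial transcripts while the user is still speaking
};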
Now, let's look at the speechToTextUtils.ts file.
import speech, { SpeechClient } from '@google-cloud/speech';
import { google } from '@google-cloud/speech/build/protos/protos';
import * as pumpify from 'pumpify';
import chalk from 'chalk';
import Long from 'long'; // the protos encode resultEndTime.seconds as a Long
import { Socket } from 'socket.io';

let speechClient: SpeechClient | null = null;

class SpeechToTextUtils {
  recognizeStream!: pumpify | null;
  resultEndTime = 0;
  isFinalEndTime = 0;
  finalRequestEndTime = 0;
  bridgingOffset = 0;
  streamingLimit = 290000;
  restartCounter = 0;
  lastTranscriptWasFinal = false;
  audioInput: DataView[] = [];
  lastAudioInput: DataView[] = [];
  newStream = true;
  socket!: Socket;
  request!: google.cloud.speech.v1.IStreamingRecognitionConfig | undefined;
  restartTimeout: NodeJS.Timeout | undefined;

  set _socket(value: Socket) {
    this.socket = value;
  }

  set _request(value: google.cloud.speech.v1.IStreamingRecognitionConfig) {
    this.request = value;
  }

  startRecognitionStream() {
    this.audioInput = [];
    if (!speechClient) {
      speechClient = new speech.SpeechClient(); // Creates a client
    }
    this.recognizeStream = speechClient
      .streamingRecognize(this.request)
      .on('error', (err) => {
        console.error('Error when processing audio: ' + err);
        this.socket.emit('googleCloudStreamError', err);
        this.stopRecognitionStream();
      })
      .on('data', this.speechCallback.bind(this));

    // Restart the stream shortly before Google's streaming limit is reached.
    this.restartTimeout = setTimeout(
      this.restartStream.bind(this),
      this.streamingLimit
    );
  }

  speechCallback(stream: google.cloud.speech.v1.StreamingRecognizeResponse) {
    // Null checks (interim and final results both pass through here)
    if (
      stream.results &&
      stream.results[0] &&
      stream.results[0].resultEndTime &&
      stream.results[0].resultEndTime.nanos &&
      stream.results[0].resultEndTime.seconds &&
      stream.results[0].alternatives
    ) {
      // Convert API result end time from seconds + nanoseconds to milliseconds
      // The below seconds are useful to see the timestamps in the console
      let seconds: number;
      if (typeof stream.results[0].resultEndTime.seconds === 'string')
        seconds = parseInt(stream.results[0].resultEndTime.seconds);
      else if (Long.isLong(stream.results[0].resultEndTime.seconds))
        seconds = stream.results[0].resultEndTime.seconds.toNumber();
      else seconds = stream.results[0].resultEndTime.seconds;
      this.resultEndTime =
        seconds * 1000 +
        Math.round(stream.results[0].resultEndTime.nanos / 1000000);

      // Calculate correct time based on offset from audio sent twice
      const correctedTime =
        this.resultEndTime -
        this.bridgingOffset +
        this.streamingLimit * this.restartCounter;

      process.stdout.clearLine(0);
      process.stdout.cursorTo(0);
      let stdoutText = '';
      if (stream.results[0] && stream.results[0].alternatives[0]) {
        stdoutText =
          correctedTime + ': ' + stream.results[0].alternatives[0].transcript;
      }

      if (stream.results[0].isFinal) {
        process.stdout.write(chalk.green(`${stdoutText}\n`));
        this.socket.emit(
          'speechData',
          stream.results[0].alternatives[0].transcript
        );

        this.isFinalEndTime = this.resultEndTime;
        this.lastTranscriptWasFinal = true;
      } else {
        // Make sure transcript does not exceed console character length
        if (stdoutText.length > process.stdout.columns) {
          stdoutText =
            stdoutText.substring(0, process.stdout.columns - 4) + '...';
        }
        process.stdout.write(chalk.red(`${stdoutText}`));

        this.lastTranscriptWasFinal = false;
      }
    }
  }
}

export default new SpeechToTextUtils();
We have a couple of setters which help us retain the socket instance and the request config.
The startRecognitionStream method initializes the Cloud Speech client with the request config and binds the speechCallback, which is triggered every time the Google API returns a transcription response.
We use chalk to highlight the timestamps and the transcriptions: interim results live in red, and green once a result is final.
The restartTimeout fires the restartStream method once streamingLimit (290,000 ms, just under five minutes) has elapsed, since roughly five minutes is the maximum continuous recognition limit Google has set for a single streaming Speech-To-Text request.
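To make the timestamp bookkeeping concrete, here is a small worked example of the correctedTime calculation, using made-up numbers:

// Illustrative numbers only: a result ending 12.34 s into the third stream
// (restartCounter = 2), with no bridging offset carried over.
const streamingLimit = 290000;
const resultEndTime = 12 * 1000 + Math.round(340000000 / 1000000); // 12340 ms
const correctedTime = resultEndTime - 0 + streamingLimit * 2; // 592340 ms
// i.e. this word ended roughly 592 seconds after the very first chunk of audio.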
How is the input audio handled? We’ll look at that next.
  /**
   * Receives streaming data and writes it to the recognizeStream for transcription
   *
   * @param {DataView} data A chunk of audio data received from the client
   */
  receiveData(data: DataView) {
    if (
      this.newStream &&
      this.lastAudioInput.length !== 0 &&
      this.recognizeStream
    ) {
      // Approximate math to calculate time of chunks
      const chunkTime = this.streamingLimit / this.lastAudioInput.length;
      if (chunkTime !== 0) {
        if (this.bridgingOffset < 0) {
          this.bridgingOffset = 0;
        }
        if (this.bridgingOffset > this.finalRequestEndTime) {
          this.bridgingOffset = this.finalRequestEndTime;
        }
        const chunksFromMS = Math.floor(
          (this.finalRequestEndTime - this.bridgingOffset) / chunkTime
        );
        this.bridgingOffset = Math.floor(
          (this.lastAudioInput.length - chunksFromMS) * chunkTime
        );

        // Replay the tail of the previous stream into the new one
        for (let i = chunksFromMS; i < this.lastAudioInput.length; i++) {
          this.recognizeStream.write(this.lastAudioInput[i]);
        }
      }
      this.newStream = false;
    }

    this.audioInput.push(data);

    if (this.recognizeStream) {
      this.recognizeStream.write(data);
    }
  }
The receiveData function comes from Google's Node.js client library samples on GitHub. It writes the incoming binary chunks of audio into streamingRecognize, and, right after a restart, first replays the tail of the previous stream's audio (lastAudioInput) so that speech crossing the restart boundary is not lost.
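A quick worked example of that bridging math, again with made-up numbers:

// Suppose the previous stream buffered 2900 chunks over streamingLimit = 290000 ms,
// so each chunk covers roughly 100 ms of audio.
const streamingLimit = 290000;
const lastAudioInputLength = 2900;
const chunkTime = streamingLimit / lastAudioInputLength; // 100 ms per chunk

// The last *final* transcript ended at 289000 ms, so about one second of audio
// was never covered by a final result.
const finalRequestEndTime = 289000;
const chunksFromMS = Math.floor((finalRequestEndTime - 0) / chunkTime); // 2890
const bridgingOffset = Math.floor((lastAudioInputLength - chunksFromMS) * chunkTime); // 1000 ms

// Chunks 2890..2899 are replayed into the new stream, and the 1000 ms offset is
// later subtracted from correctedTime so timestamps stay continuous across the restart.

With the audio path covered, the remaining two methods handle restarting the stream and shutting it down.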
  restartStream() {
    if (this.recognizeStream) {
      this.recognizeStream.end();
      this.recognizeStream.removeAllListeners();
      this.recognizeStream = null;
    }
    if (this.resultEndTime > 0) {
      this.finalRequestEndTime = this.isFinalEndTime;
    }
    this.resultEndTime = 0;

    this.lastAudioInput = [];
    this.lastAudioInput = this.audioInput;

    this.restartCounter++;

    if (!this.lastTranscriptWasFinal) {
      process.stdout.write("\n");
    }
    process.stdout.write(
      chalk.yellow(
        `${this.streamingLimit * this.restartCounter}: RESTARTING REQUEST\n`
      )
    );

    this.newStream = true;

    this.startRecognitionStream();
  }

  /**
   * Closes the recognize stream and wipes it
   */
  stopRecognitionStream() {
    if (this.recognizeStream) {
      this.recognizeStream.end();
      this.recognizeStream.removeAllListeners();
    }
    if (this.restartTimeout) {
      clearTimeout(this.restartTimeout);
    }
    this.recognizeStream = null;
  }
Google enforces a limit of roughly five minutes on any single streaming recognition request. To transcribe for longer than that, we use these two functions, adapted from Google's official samples.
It is imperative that we remove all event listeners when a stream is torn down; otherwise Node keeps accumulating listeners across restarts and will eventually warn about a potential memory leak.
Moving on to the front-end now.
Setting up the Client Side
Run these commands to set up the Angular project:
mkdir client && cd client
npx ng new speech-to-text --directory=.
npm install @google-cloud/speech socket.io-client
You can customize the app.component files as you like, but the basic template should set up the socket connection as follows.
import { io, Socket } from 'socket.io-client';
import { AudioStreamer } from 'src/app/app.service';

// Inside the AppComponent class; socket, isRecording and liveTranscription
// are fields declared on the component.
constructor(private audioStreamer: AudioStreamer) {
  this.liveTranscription = '';
  this.socket = io('https://localhost:3000');
  this.socket.on('connect', () => {
    console.log('Socket Connection: ', this.socket?.connected);
  });
  this.audioStreamer._socket = this.socket;
}

startRecording() {
  console.log('startRecording');
  if (!this.isRecording) {
    this.liveTranscription = '';
  }
  this.isRecording = true;
  this.audioStreamer.initRecording(
    (data: string) => {
      this.liveTranscription += data + '. ';
    },
    (error) => {
      console.error('Error when processing audio');
    }
  );
}

stopRecording() {
  this.isRecording = false;
  console.log('stopRecording');
  this.audioStreamer.stopRecording();
}
You can use the liveTranscription variable to output the transcription as HTML through Angular’s data binding.
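As an illustration, a minimal component decorator with an inline template could look like the sketch below; the selector and button markup are assumptions for the example, not part of the original code.

import { Component } from '@angular/core';

@Component({
  selector: 'app-root',
  template: `
    <button (click)="startRecording()" [disabled]="isRecording">Start</button>
    <button (click)="stopRecording()" [disabled]="!isRecording">Stop</button>
    <p>{{ liveTranscription }}</p>
  `,
})
export class AppComponent {
  isRecording = false;
  liveTranscription = '';

  startRecording() {
    /* as shown in the snippet above */
  }

  stopRecording() {
    /* as shown in the snippet above */
  }
}

Next comes the AudioStreamer service (src/app/app.service.ts), which captures the microphone audio and streams it to the server: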
import { Injectable } from '@angular/core';
import { Socket } from 'socket.io-client';

// Stream Audio
let AudioContext;

// audioStream constraints
const constraints = {
  audio: true,
  video: false,
};

@Injectable({
  providedIn: 'root',
})
export class AudioStreamer {
  input!: MediaStreamAudioSourceNode | null;
  globalStream: MediaStream | undefined;
  processor!: AudioWorkletNode | null;
  audioContext!: AudioContext | null;
  socket!: Socket;

  set _socket(value: Socket) {
    this.socket = value;
  }

  /**
   * @param {function} onData Callback to run on data each time it's received
   * @param {function} onError Callback to run on an error if one is emitted.
   */
  initRecording(onData: (arg0: any) => void, onError: (arg0: string) => void) {
    this.socket.emit('startGoogleCloudStream', {
      config: {
        encoding: 'LINEAR16',
        sampleRateHertz: 16000,
        languageCode: 'en-US',
        profanityFilter: false,
        enableWordTimeOffsets: true,
      },
      interimResults: true, // If you want interim results, set this to true
    }); // init socket Google Speech Connection
    AudioContext = window.AudioContext;
    this.audioContext = new AudioContext({
      latencyHint: 'interactive',
    });

    const handleSuccess = async (stream: MediaStream) => {
      this.globalStream = stream;
      this.input = this.audioContext!.createMediaStreamSource(stream);
      await this.audioContext!.audioWorklet.addModule(
        '/assets/audio-processor.js'
      );
      this.audioContext!.resume();
      this.processor = new AudioWorkletNode(
        this.audioContext!,
        'recorder.worklet'
      );
      this.processor.connect(this.audioContext!.destination);
      this.input.connect(this.processor);

      // The worklet posts downsampled Int16 buffers back to the main thread.
      this.processor.port.onmessage = (event: MessageEvent<ArrayBufferLike>) => {
        const audioData = event.data;
        this.sendAudio(audioData);
      };
    };

    navigator.mediaDevices.getUserMedia(constraints).then(handleSuccess);

    // Bind the data handler callback
    if (onData) {
      this.socket.on('speechData', (data) => {
        onData(data);
      });
    }

    this.socket.on('googleCloudStreamError', (error) => {
      if (onError) {
        onError('error');
      }
      // We don't want to emit another end stream event
      this.closeAll();
    });
  }

  sendAudio(buffer: ArrayBufferLike) {
    this.socket.emit('binaryAudioData', buffer);
  }

  stopRecording() {
    this.socket.emit('endGoogleCloudStream', '');
    this.closeAll();
  }

  /**
   * Stops recording and closes everything down. Runs on error or on stop.
   */
  closeAll() {
    // Clear the listeners (prevents issue if opening and closing repeatedly)
    this.socket.off('speechData');
    this.socket.off('googleCloudStreamError');
    let tracks = this.globalStream ? this.globalStream.getTracks() : null;
    let track = tracks ? tracks[0] : null;
    if (track) {
      track.stop();
    }

    if (this.processor) {
      if (this.input) {
        try {
          this.input.disconnect(this.processor);
        } catch (error) {
          console.warn('Attempt to disconnect input failed.');
        }
      }
      this.processor.disconnect(this.audioContext!.destination);
    }
    if (this.audioContext) {
      this.audioContext.close().then(() => {
        this.input = null;
        this.audioContext = null;
        AudioContext = null;
      });
    }
  }
}

export default AudioStreamer;
We use the AudioContext API to process our microphone audio and send it to the backend via the socket connection. Google's STT API expects audio as 16-bit integers (LINEAR16), so to convert it we attach a custom audio processor that resamples the microphone input.
NOTE: Make sure the audio-processor.js file is stored in the src/assets folder; otherwise Angular will not serve it and the addModule call above will not find it.
Since the audio processor is added as an AudioWorklet, it runs off the main execution thread, so we cannot emit to the socket directly from the processor file. Instead, we use the worklet's built-in message port (port.postMessage) to send the downsampled bytes back to the service.
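The exact processor the article uses lives in the linked GitHub repo, but here is a minimal sketch of what assets/audio-processor.js could look like, assuming a naive downsample from the context's native sample rate to 16 kHz and a float-to-Int16 conversion:

// assets/audio-processor.js - a sketch, not the repo's exact implementation.
// Runs inside the AudioWorkletGlobalScope, where `sampleRate` is a global.
const TARGET_SAMPLE_RATE = 16000;

class RecorderProcessor extends AudioWorkletProcessor {
  process(inputs) {
    const input = inputs[0];
    if (input && input[0]) {
      const float32 = input[0]; // mono channel, 128 samples per render quantum
      // Naive downsample: keep every Nth sample.
      const ratio = sampleRate / TARGET_SAMPLE_RATE;
      const length = Math.floor(float32.length / ratio);
      const int16 = new Int16Array(length);
      for (let i = 0; i < length; i++) {
        // Clamp to [-1, 1] and scale to 16-bit signed integers (LINEAR16).
        const sample = Math.max(-1, Math.min(1, float32[Math.floor(i * ratio)]));
        int16[i] = sample < 0 ? sample * 0x8000 : sample * 0x7fff;
      }
      // Post the raw bytes back to the main thread, where the service
      // forwards them over the socket.
      this.port.postMessage(int16.buffer, [int16.buffer]);
    }
    return true; // keep the processor alive
  }
}

// The name registered here must match the one passed to new AudioWorkletNode.
registerProcessor('recorder.worklet', RecorderProcessor);

A production version would typically buffer a few thousand samples before posting rather than sending a message every 128-sample block, but the shape is the same: convert to Int16, then hand the bytes to the main thread through port.postMessage. The string passed to registerProcessor must match the name given to new AudioWorkletNode ('recorder.worklet' in the service above).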
Now we can send the audio to the socket.
Using an event listener, we can wait for the transcribed strings coming back from the server and handle them however we wish. Print them on the screen? Live subtitles for your videos? Anything is possible.
Even something like this: Interview Warmup — Grow with Google
That concludes this article. Thank you for reading this far ❤️
This article should give you a rundown of the detailed code provided here: GitHub