Speech To Text in Realtime

September 17, 2022


So you want to transcribe audio from your microphone in real-time with a readily available API with a good response time?

Setting up the Server

mkdir server && cd server
npm init -y
# nodemon is needed by the "dev" script that restarts the compiled server.
npm i -D typescript @tsconfig/node16 concurrently nodemon @types/pumpify @types/node @types/express
npx tsc --init
# The server code imports @google-cloud/speech and chalk. chalk is pinned to v4
# because v5 is ESM-only and this project compiles to CommonJS.
npm i socket.io express @google-cloud/speech chalk@4

Next, we open the tsconfig.json file generated by the above command. Make sure it looks something like this:

{
    "extends": "@tsconfig/node16/tsconfig.json",
    "compilerOptions": {
        "target": "es5",
        "module": "commonjs",
        "outDir": "dist",
        "strict": true,
        "esModuleInterop": true,
        "skipLibCheck": true,
        "forceConsistentCasingInFileNames": true
    },
    "include": ["src"],
    "exclude": ["node_modules"]
}

Let’s set up TypeScript to watch for code changes and transpile it to JavaScript on the fly with concurrently.

Add the following script to your package.json:

"scripts": {
  "dev": "concurrently  \"npx tsc --watch\" \"nodemon -q ./dist/index.js\""
}

Now, to start the server, simply run `npm run dev`.

Coding the server.ts file

This is a basic express server boilerplate you can use to get up and running.

 1import express, { Express, Request, Response } from "express";
 2import { createServer } from "https";
 3const app: Express = express();
 4const server = createServer(
 5  {
 6    key: fs.readFileSync("key.pem"),
 7    cert: fs.readFileSync("cert.pem"),
 8  },
 9  app
11const port = 3000;
12const io = new Server(server, {
13  cors: {
14    origin: "*",
15  },
18server.listen(port, () => {
19  console.log(`⚡️[server]: Server is running at https://localhost:${port}`);

The next step is to set up socket.io and add some event listeners.

 1import { google } from "@google-cloud/speech/build/protos/protos";
 2import { Server, Socket } from "socket.io";
 3import speechToTextUtils from "./speechToTextUtils";
 5io.on("connection", (socket: Socket) => {
 6  console.log("Socket Connection: ", socket.connected);
 7  console.log("Socket Id: ", socket.id);
 8  speechToTextUtils._socket = socket;
10  // Define socket eventListeners.
11  socket.on(
12    "startGoogleCloudStream",
13    (request: google.cloud.speech.v1.IStreamingRecognitionConfig) => {
14      // Very long type, but alas that's what Google decided to use.
15      speechToTextUtils._request = request;
16      console.log("Starting Google Cloud Transcription");
17      speechToTextUtils.startRecognitionStream();
18    }
19  );
21  // Receive audio data from front-end
22  socket.on("binaryAudioData", (data: DataView) => {
23    speechToTextUtils.receiveData(data);
24  });
26  // End the audio stream
27  socket.on("endGoogleCloudStream", () => {
28    speechToTextUtils.stopRecognitionStream();
29  });

Let’s go over what the eventListeners do here:

  • startGoogleCloudStream: As the name says, this initializes the speechToTextUtils class and sets up the options necessary which we will look at soon.

  • binaryAudioData: This receives the audio data from the front-end as binary int16 buffers. This is processed by the receiveData function in speechToTextUtils.

  • endGoogleCloudStream: This calls a helper function that turns off all eventListeners and shuts down the transcription.

Now, let's look at the speechToTextUtils.ts file.

  1import speech, { SpeechClient } from '@google-cloud/speech';
  2import { google } from '@google-cloud/speech/build/protos/protos';
  3import * as pumpify from 'pumpify';
  4import chalk from 'chalk';
  5import { Socket } from 'socket.io';
  6let speechClient: SpeechClient | null = null;
  8class SpeechToTextUtils {
  9	recognizeStream!: pumpify | null;
 10	resultEndTime = 0;
 11	isFinalEndTime = 0;
 12	finalRequestEndTime = 0;
 13	bridgingOffset = 0;
 14	streamingLimit = 290000;
 15	restartCounter = 0;
 16	lastTranscriptWasFinal = false;
 17	audioInput: DataView[] = [];
 18	lastAudioInput: DataView[] = [];
 19	newStream = true;
 20	socket!: Socket;
 21	request!: google.cloud.speech.v1.IStreamingRecognitionConfig | undefined;
 22	restartTimeout: NodeJS.Timeout | undefined;
 24	set _socket(value: Socket) {
 25		this.socket = value;
 26	}
 28	set _request(value: google.cloud.speech.v1.IStreamingRecognitionConfig) {
 29		this.request = value;
 30	}
 32	startRecognitionStream() {
 33		this.audioInput = [];
 34		if (!speechClient) {
 35			speechClient = new speech.SpeechClient(); // Creates a client
 36		}
 37		this.recognizeStream = speechClient
 38			.streamingRecognize(this.request)
 39			.on('error', (err) => {
 40				console.error('Error when processing audio: ' + err);
 41				this.socket.emit('googleCloudStreamError', err);
 42				this.stopRecognitionStream();
 43			})
 44			.on('data', this.speechCallback.bind(this));
 46		this.restartTimeout = setTimeout(
 47			this.restartStream.bind(this),
 48			this.streamingLimit
 49		);
 50	}
 52	speechCallback(stream: google.cloud.speech.v1.StreamingRecognizeResponse) {
 53		// Null checks
 54		if (
 55			stream.results &&
 56			stream.results[0] &&
 57			stream.results[0].resultEndTime &&
 58			stream.results[0].resultEndTime.nanos &&
 59			stream.results[0].resultEndTime.seconds &&
 60			stream.results[0].alternatives &&
 61			stream.results[0].isFinal
 62		) {
 63			// Convert API result end time from seconds + nanoseconds to milliseconds
 64			// The below seconds are useful to see the timestamps in the console
 65			let seconds: number;
 66			if (typeof stream.results[0].resultEndTime.seconds === 'string')
 67				seconds = parseInt(stream.results[0].resultEndTime.seconds);
 68			else if (Long.isLong(stream.results[0].resultEndTime.seconds))
 69				seconds = stream.results[0].resultEndTime.seconds.toNumber();
 70			else seconds = stream.results[0].resultEndTime.seconds;
 71			this.resultEndTime =
 72				seconds * 1000 +
 73				Math.round(stream.results[0].resultEndTime.nanos / 1000000);
 75			// Calculate correct time based on offset from audio sent twice
 76			const correctedTime =
 77				this.resultEndTime -
 78				this.bridgingOffset +
 79				this.streamingLimit * this.restartCounter;
 81			process.stdout.clearLine(0);
 82			process.stdout.cursorTo(0);
 83			let stdoutText = '';
 84			if (stream.results[0] && stream.results[0].alternatives[0]) {
 85				stdoutText =
 86					correctedTime + ': ' + stream.results[0].alternatives[0].transcript;
 87			}
 89			if (stream.results[0].isFinal) {
 90				process.stdout.write(chalk.green(`${stdoutText}\n`));
 91				this.socket.emit(
 92					'speechData',
 93					stream.results[0].alternatives[0].transcript
 94				);
 96				this.isFinalEndTime = this.resultEndTime;
 97				this.lastTranscriptWasFinal = true;
 98			} else {
 99				// Make sure transcript does not exceed console character length
100				if (stdoutText.length > process.stdout.columns) {
101					stdoutText =
102						stdoutText.substring(0, process.stdout.columns - 4) + '...';
103				}
104				process.stdout.write(chalk.red(`${stdoutText}`));
106				this.lastTranscriptWasFinal = false;
107			}
108		}
109	}
112export default new SpeechToTextUtils();

We have a couple of setters which help us retain the socket instance and the request config.

The startRecognitionStream method initializes the cloud speech client with the request config and binds the speechCallback, which is triggered every time the Google API returns a transcription response.

We use chalk to highlight the timestamps and the transcriptions live in red, and green once it is final.

The restartTimeout triggers the restartStream method once the timer has reached about 5 minutes as this is the max continuous recognition limit Google has set for the Speech-To-Text API.

How is the input audio handled? We’ll look at that next.

 2   * Receives streaming data and writes it to the recognizeStream for transcription
 3   *
 4   * @param {Buffer} data A section of audio data
 5   */
 6  receiveData(data: DataView) {
 7    if (
 8      this.newStream &&
 9      this.lastAudioInput.length !== 0 &&
10      this.recognizeStream
11    ) {
12      // Approximate math to calculate time of chunks
13      const chunkTime = this.streamingLimit / this.lastAudioInput.length;
14      if (chunkTime !== 0) {
15        if (this.bridgingOffset < 0) {
16          this.bridgingOffset = 0;
17        }
18        if (this.bridgingOffset > this.finalRequestEndTime) {
19          this.bridgingOffset = this.finalRequestEndTime;
20        }
21        const chunksFromMS = Math.floor(
22          (this.finalRequestEndTime - this.bridgingOffset) / chunkTime
23        );
24        this.bridgingOffset = Math.floor(
25          (this.lastAudioInput.length - chunksFromMS) * chunkTime
26        );
28        for (let i = chunksFromMS; i < this.lastAudioInput.length; i++) {
29          this.recognizeStream.write(this.lastAudioInput[i]);
30        }
31      }
32      this.newStream = false;
33    }
35    this.audioInput.push(data);
37    if (this.recognizeStream) {
38      this.recognizeStream.write(data);
39    }
40  }

The receiveData function comes from Google’s Node.js client library GitHub repo. It writes the binary chunks of audio data it receives into the streamingRecognize stream.

 1  restartStream() {
 2    if (this.recognizeStream) {
 3      this.recognizeStream.end();
 4      this.recognizeStream.removeAllListeners();
 5      this.recognizeStream = null;
 6    }
 7    if (this.resultEndTime > 0) {
 8      this.finalRequestEndTime = this.isFinalEndTime;
 9    }
10    this.resultEndTime = 0;
12    this.lastAudioInput = [];
13    this.lastAudioInput = this.audioInput;
15    this.restartCounter++;
17    if (!this.lastTranscriptWasFinal) {
18      process.stdout.write("\n");
19    }
20    process.stdout.write(
21      chalk.yellow(
22        `${this.streamingLimit * this.restartCounter}: RESTARTING REQUEST\n`
23      )
24    );
26    this.newStream = true;
28    this.startRecognitionStream();
29  }
31  /**
32   * Closes the recognize stream and wipes it
33   */
34  stopRecognitionStream() {
35    if (this.recognizeStream) {
36      this.recognizeStream.end();
37			this.recognizeStream.removeAllListeners();
38    }
39    if (this.restartTimeout) {
40      clearTimeout(this.restartTimeout);
41    }
42    this.recognizeStream = null;
43  }

Google has a default timeout of five minutes for any streaming recognition API request. In order to perform transcriptions for longer than that limit, we use these two functions provided in the samples given by Google.

It is imperative that we remove all event listeners, otherwise Node will freak out.

Moving on to the front-end now.

Setting up the Client side

Run these commands to set up the angular project

mkdir client && cd client
# Generate the Angular project in the current directory.
npx ng new speech-to-text --directory=.
npm install @google-cloud/speech socket.io-client

You can customize the app.component files as you like, but the basic template should have the socket connection as follows.

 1import { io, Socket } from 'socket.io-client';
 2import {AudioStreamer} from 'src/app/app.service';
 4constructor(private audioStreamer: AudioStreamer) {
 5  this.liveTranscription = '';
 6  this.socket = io('https://localhost:3000');
 7  this.socket.on('connect', () => {
 8    console.log('Socket Connection: ', this.socket?.connected);
 9  })
10  this.audioStreamer._socket = this.socket;
/**
 * Starts a recording session and appends every transcript received from
 * the server to `liveTranscription`.
 */
startRecording() {
  console.log('startRecording');
  // initRecording registers socket listeners and creates an AudioContext on
  // every call, so a second call while recording would duplicate both.
  if (this.isRecording) {
    return;
  }
  this.liveTranscription = '';
  this.isRecording = true;
  this.audioStreamer.initRecording(
    (data: string) => {
      this.liveTranscription += data + '. ';
    },
    () => {
      console.error('Error when processing audio');
      // The service tears the session down on error; reflect that here so
      // recording can be started again.
      this.isRecording = false;
    }
  );
}
/** Ends the recording session and releases the audio resources. */
stopRecording() {
  this.isRecording = false;
  console.log('stopRecording');
  this.audioStreamer.stopRecording();
}

You can use the liveTranscription variable to output the transcription as html through Angular’s data binding.

  1import { Injectable } from '@angular/core';
  2import { Socket } from 'socket.io-client';
  4// Stream Audio
  5let bufferSize = 2048,
  6  AudioContext;
  8//audioStream constraints
  9const constraints = {
 10  audio: true,
 11  video: false,
 15  providedIn: 'root',
 17export class AudioStreamer {
 18  input!: MediaStreamAudioSourceNode | null;
 19  globalStream: MediaStream | undefined;
 20  processor!: AudioWorkletNode | null;
 21  audioContext!: AudioContext | null;
 22  socket!: Socket;
 24  set _socket(value: Socket) {
 25    this.socket = value;
 26  }
 27  /**
 28   * @param {function} onData Callback to run on data each time it's received
 29   * @param {function} onError Callback to run on an error if one is emitted.
 30   */
 31  initRecording(onData: (arg0: any) => void, onError: (arg0: string) => void) {
 32    this.socket.emit('startGoogleCloudStream', {
 33      config: {
 34        encoding: 'LINEAR16',
 35        sampleRateHertz: 16000,
 36        languageCode: 'en-US',
 37        profanityFilter: false,
 38        enableWordTimeOffsets: true,
 39      },
 40      interimResults: true, // If you want interim results, set this to true
 41    }); //init socket Google Speech Connection
 42    AudioContext = window.AudioContext;
 43    this.audioContext = new AudioContext({
 44      latencyHint: 'interactive'
 45    });
 47    const handleSuccess = async (stream: MediaStream) => {
 48      this.globalStream = stream;
 49      this.input = this.audioContext!.createMediaStreamSource(stream);
 50      await this.audioContext!.audioWorklet.addModule(
 51        '/assets/audio-processor.js'
 52      );
 53      this.audioContext!.resume();
 54      this.processor = new AudioWorkletNode(
 55        this.audioContext!,
 56        'recorder.worklet'
 57      );
 58      this.processor.connect(this.audioContext!.destination);
 59      this.audioContext!.resume();
 60      this.input.connect(this.processor);
 62      this.processor.port.onmessage = (event: MessageEvent<ArrayBufferLike>) => {
 63        const audioData = event.data;
 64        this.sendAudio(audioData);
 65      }
 67    };
 69    navigator.mediaDevices.getUserMedia(constraints).then(handleSuccess);
 71    // Bind the data handler callback
 72    if (onData) {
 73      this.socket.on('speechData', (data) => {
 74        onData(data);
 75      });
 76    }
 78    this.socket.on('googleCloudStreamError', (error) => {
 79      if (onError) {
 80        onError('error');
 81      }
 82      // We don't want to emit another end stream event
 83      this.closeAll();
 84    });
 85  }
 87  sendAudio(buffer: ArrayBufferLike) {
 88    this.socket.emit('binaryAudioData', buffer);
 89  }
 91  stopRecording() {
 92    this.socket.emit('endGoogleCloudStream', '');
 93    this.closeAll();
 94  }
 96  /**
 97   * Stops recording and closes everything down. Runs on error or on stop.
 98   */
 99  closeAll() {
100    // Clear the listeners (prevents issue if opening and closing repeatedly)
101    this.socket.off('speechData');
102    this.socket.off('googleCloudStreamError');
103    let tracks = this.globalStream ? this.globalStream.getTracks() : null;
104    let track = tracks ? tracks[0] : null;
105    if (track) {
106      track.stop();
107    }
109    if (this.processor) {
110      if (this.input) {
111        try {
112          this.input.disconnect(this.processor);
113        } catch (error) {
114          console.warn('Attempt to disconnect input failed.');
115        }
116      }
117      this.processor.disconnect(this.audioContext!.destination);
118    }
119    if (this.audioContext) {
120      this.audioContext.close().then(() => {
121        this.input = null;
122        this.audioContext = null;
123        AudioContext = null;
124      });
125    }
126  }
129export default AudioStreamer;

We use the AudioContext API to process our microphone audio and send it to the backend via the socket connection. Google’s STT API expects audio as 16-bit integers. To convert our microphone input into that format, we need to attach a custom audio processor that will resample it.

NOTE: Make sure the audio-processor.js file is stored in the assets folder; otherwise Angular will not detect it.
Since the AudioProcessor is added as a worklet, it is detached from the main execution thread, so we cannot send the data to the socket from the audio processor file. Instead, we make use of the built-in messaging function to send the downsampled bytes back to the service file.

Now we can send the audio to the socket.

Using an EventListener we can wait for the transcribed strings to be sent from the server and deal with them as we wish. Print them on the screen? Live subtitles for your videos? Anything is possible.

Even something like this: Interview Warmup — Grow with Google

That concludes this article, thank you for reading this far ❤️

This article should give you a rundown on the detailed code provided here: GitHub