Multimodal Integration with OpenAI

Nov 14 2024 · Python 3.12, OpenAI 1.52, JupyterLab, Visual Studio Code

Lesson 04: Speech Recognition & Synthesis

Demo of Speech Recognition and Synthesis Using Whisper & TTS

To set up your development environment for the OpenAI API, refer back to Lesson 1: Introduction to Multimodal AI, which covers installing the necessary libraries and configuring your environment.

# Install additional dependencies for this lesson
!pip install librosa
# Load the OpenAI library
from openai import OpenAI

# Set up relevant environment variables
# Make sure OPENAI_API_KEY=... exists in .env
from dotenv import load_dotenv

load_dotenv()

# Create the OpenAI connection object
client = OpenAI()
# Download and load an audio file using librosa

# Import libraries
import requests
import io
import librosa
from IPython.display import Audio, display

# URL of the sample audio file
speech_download_link = "https://cdn.pixabay.com/download/audio/2022/03/10/audio_a8e603753c.mp3?filename=self-destruct-sequence-31505.mp3"

# Local path where the audio file will be saved
save_path = "audio/self-destruct-sequence.mp3"

# Download the audio file
response = requests.get(speech_download_link)
if response.status_code == 200:
    audio_data = io.BytesIO(response.content)

    # Save the audio file locally
    with open(save_path, 'wb') as file:
        file.write(response.content)

    # Load the audio file using librosa
    y, sr = librosa.load(audio_data)

    # Display the audio file so it can be played
    audio = Audio(data=y, rate=sr, autoplay=True)
    display(audio)
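The download cell above writes to the audio/ folder inside the course materials. If you're running the notebook somewhere that folder doesn't exist yet, create it first; a minimal sketch:

# Create the audio directory if it doesn't already exist
import os
os.makedirs("audio", exist_ok=True)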
# Function to play the audio file

def play_speech(file_path):
    # Load the audio file using librosa
    y, sr = librosa.load(file_path)

    # Create an Audio object for playback
    audio = Audio(data=y, rate=sr, autoplay=True)

    # Display the audio player
    display(audio)
# Transcribe the audio file using the Whisper model

with open(save_path, "rb") as audio_file:
    # Transcribe the audio file using the Whisper model
    transcription = client.audio.transcriptions.create(
      model="whisper-1",
      file=audio_file,
      response_format="json"
    )
# Print the transcription result in JSON format
print(transcription.json())
# Print only the transcribed text
print(transcription.text)
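Depending on your Pydantic version, calling transcription.json() may print a deprecation warning, because the SDK's response objects are Pydantic models. If you see one, the newer equivalent is model_dump_json() — a minimal sketch, assuming a Pydantic v2-based SDK:

# Same JSON output via the non-deprecated Pydantic v2 method
print(transcription.model_dump_json())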
# Retrieve the detailed information with timestamps

with open(save_path, "rb") as audio_file:
    # Transcribe the audio file with word-level timestamps
    transcription = client.audio.transcriptions.create(
      model="whisper-1",
      file=audio_file,
      response_format="verbose_json",
      timestamp_granularities=["word"]
    )
# Print the detailed information for each word timestamp

import json

json_result = transcription.json()
print(json_result)

json_object = json.loads(json_result)
print(json_object["text"])
# Print the detailed information for each word
print(transcription.words)
# Print the detailed information for the first two words
print(transcription.words[0])
print(transcription.words[1])
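Each entry in transcription.words pairs a word with its start and end time in seconds, so you can loop over the list to build a readable per-word timeline. A minimal sketch, assuming each entry exposes word, start, and end attributes (some SDK versions return dictionaries instead, so adjust the access accordingly):

# Print every word with its start/end timestamps
for word_info in transcription.words:
    print(f"{word_info.word}: {word_info.start:.2f}s - {word_info.end:.2f}s")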
# Retrieve the detailed information with segment-level timestamps

with open(save_path, "rb") as audio_file:
    # Transcribe the audio file with segment-level timestamps
    transcription = client.audio.transcriptions.create(
      model="whisper-1",
      file=audio_file,
      response_format="verbose_json",
      timestamp_granularities=["segment"]
    )
# Print the detailed information for the first two segments
print(transcription.segments[0])
print(transcription.segments[1])
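Segment-level timestamps group the transcript into longer spans, which is convenient for tasks like generating subtitles. A minimal sketch that prints each segment's time range and text, assuming each segment exposes start, end, and text (as with words, some SDK versions return dictionaries instead of objects):

# Print each segment's time range and its text
for segment in transcription.segments:
    print(f"[{segment.start:.2f}s - {segment.end:.2f}s] {segment.text}")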
# Load & play kodeco-speech.mp3 audio file

# Path to another audio file
ai_programming_audio_path = "audio/kodeco-speech.mp3"
# Play the audio file
play_speech(ai_programming_audio_path)
# Transcribe the audio file with `text` response format

with open(ai_programming_audio_path, "rb") as audio_file:
    # Transcribe the audio file to text
    transcription = client.audio.transcriptions.create(
      model="whisper-1",
      file=audio_file,
      response_format="text"
    )
# Print the transcribed text
print(transcription)
# Transcribe the audio file with a prompt to improve accuracy

with open(ai_programming_audio_path, "rb") as audio_file:
    # Transcribe the audio file with a prompt to improve accuracy
    transcription = client.audio.transcriptions.create(
      model="whisper-1",
      file=audio_file,
      response_format="text",
      prompt="Kodeco,RayWenderlich"
    )
# Print the transcribed text
print(transcription)
# Load & play japanese-speech.mp3 audio file

# The speech in Japanese: いらっしゃいませ。ラーメン屋へようこそ。何をご注文なさいますか? (Welcome! Welcome to the ramen shop. What would you like to order?)
# Path to the Japanese audio file
japanese_audio_path = "audio/japanese-speech.mp3"
# Play the Japanese audio file
play_speech(japanese_audio_path)
# Translate the Japanese audio to English text

with open(japanese_audio_path, "rb") as audio_file:
    # Translate the Japanese audio to English text
    translation = client.audio.translations.create(
      model="whisper-1",
      file=audio_file,
      response_format="text"
    )
# Print the translated text
print(translation)
# Generate speech from text using OpenAI's TTS model

# Path to save the synthesized speech
speech_file_path = "audio/learn-ai.mp3"

# Generate speech from text using OpenAI's TTS model
with client.audio.speech.with_streaming_response.create(
  model="tts-1",
  voice="alloy",
  input="Would you like to learn AI programming? We have many AI
    programming courses that you can choose."
) as response:
  # Save the synthesized speech to the specified path
  response.stream_to_file(speech_file_path)
# Play the synthesized speech
play_speech(speech_file_path)
# Generate speech with a different voice and slower speed
response = client.audio.speech.create(
  model="tts-1",
  voice="echo",
  speed=0.6,
  input="Would you like to learn AI programming? We have many
    AI programming courses that you can choose."
)

# Save the synthesized speech to the specified path
response.stream_to_file(speech_file_path)

# Play the synthesized speech
play_speech(speech_file_path)
Running this cell prints a deprecation warning:

DeprecationWarning: Due to a bug, this method doesn't actually stream the response content, `.with_streaming_response.method()` should be used instead
  response.stream_to_file(speech_file_path)
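The warning appears because this second example calls client.audio.speech.create() and then stream_to_file() on a non-streaming response. You can avoid it by using the same with_streaming_response pattern as the first example; a minimal sketch reusing the client and file path already defined above:

# Generate the slower "echo" speech using the streaming-response API instead
with client.audio.speech.with_streaming_response.create(
  model="tts-1",
  voice="echo",
  speed=0.6,
  input="Would you like to learn AI programming? We have many AI programming courses that you can choose from."
) as response:
  # Stream the synthesized audio directly to the file
  response.stream_to_file(speech_file_path)

# Play the synthesized speech
play_speech(speech_file_path)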