Multimodal Integration with OpenAI

Nov 14 2024 · Python 3.12, OpenAI 1.52, JupyterLab, Visual Studio Code

Lesson 05: Building a Multimodal AI App

Demo of Generating Situational Prompts & Images

Episode complete

Play next episode

Next

Heads up... You’re accessing parts of this content for free, with some sections shown as obfuscated text.

Heads up... You’re accessing parts of this content for free, with some sections shown as obfuscated text.

Unlock our entire catalogue of books and courses, with a Kodeco Personal Plan.

Unlock now

In this demo, you’ll create functions that generate situational prompts and matching scenery images, and you’ll implement speech recognition and speech synthesis.

# Function to generate a situational prompt for practicing English
def generate_situational_prompt(seed_prompt=""):
    """Ask the chat model for an English-practice situation and opening line.

    The request tells the model to write a second-person situation, then a
    line containing "====", then the first thing the other person says.

    Relies on a module-level ``client`` that is not defined in this section.

    Args:
        seed_prompt: Optional topic used to steer the situation. When it is
            empty, the request asks for a generic situation instead.

    Returns:
        The message content of the first choice in the chat completion.
    """
    # Instructions appended to both variants of the request
    additional_prompt = """
    Then create an initial response to the person. If the situation
      is "ordering coffee in a cafe.", then the initial response will
      be, "Hello, what would you like to order?". Separate the initial
      situation and the initial response with a line containing "====".
      Something like:
        "You're ordering coffee in a cafe.
        ====
        'Hello, there. What would you like to order?'"
        Limit the output to 1 sentence.
    """
    # Pick the request wording depending on whether a seed was supplied
    if seed_prompt:
        seed_phrase = f"""Generate a second-person POV situation
          for practicing English with this seed prompt: {seed_prompt}.
        {additional_prompt}"""
    else:
        seed_phrase = f"""Generate a second-person POV situation
          for practicing English, like meeting your parents-in-law,
          etc.
        {additional_prompt}"""
    # Use GPT to generate a situation for practicing English
    response = client.chat.completions.create(
      model="gpt-4o",
      messages=[
        # The system message must be one string literal; a quoted string
        # cannot span physical lines without explicit joining.
        {"role": "system",
         "content": "You are a creative writer. Very very creative."},
        {"role": "user", "content": seed_phrase}
      ]
    )
    # The text still contains both the situation and the initial response;
    # splitting on "====" is left to the caller
    return response.choices[0].message.content
# Test the function with no seed, so the generic request wording is used.
# NOTE(review): the return value is not assigned; presumably this relies on
# the notebook echoing the cell's last expression — confirm.
generate_situational_prompt()
# Test the function again, steering the situation with a seed prompt
generate_situational_prompt("comics exhibition")
# Generate an image based on the situational prompt

# Import necessary libraries for image processing and display
import requests
from PIL import Image
from io import BytesIO

def generate_situation_image(dalle_prompt, timeout=30):
    """Generate one image from a text prompt and return it as a PIL image.

    Relies on a module-level ``client`` that is not defined in this section.

    Args:
        dalle_prompt: Text describing the image to generate.
        timeout: Seconds passed to ``requests.get`` when downloading the
            generated image, so a stalled download cannot hang forever.

    Returns:
        The image opened with ``PIL.Image.open`` from the downloaded bytes.

    Raises:
        requests.HTTPError: If the image download returns an error status.
    """
    # Request a single 1024x1024 image from the DALL-E 3 model
    generation = client.images.generate(
      model="dall-e-3",
      prompt=dalle_prompt,
      size="1024x1024",
      n=1,
    )

    # The API response carries a URL to the image, not the image bytes
    image_url = generation.data[0].url

    # Download the image; fail loudly on an HTTP error instead of handing
    # an error page to PIL
    download = requests.get(image_url, timeout=timeout)
    download.raise_for_status()

    # Decode the downloaded bytes into an image object
    return Image.open(BytesIO(download.content))
# Import matplotlib to display images in the cell
import matplotlib.pyplot as plt

# Display the image in the cell
def display_image(img):
    """Show ``img`` with matplotlib, with the axes turned off.

    Args:
        img: The image to pass to ``plt.imshow``.
    """
    # Draw the image on the current axes
    plt.imshow(img)
    # Turn the axes off so only the picture is shown
    plt.axis('off')
    # Render the figure
    plt.show()
# Build a situation seeded with "cafe", then illustrate only the
# situation part: the text that precedes the "====" separator
full_response = generate_situational_prompt(seed_prompt="cafe")
initial_situation_prompt = full_response.partition('====')[0].strip()
print(initial_situation_prompt)
img = generate_situation_image(dalle_prompt=initial_situation_prompt)
display_image(img=img)
# Play the audio file

# Import necessary libraries for audio processing and display
import librosa
from IPython.display import Audio, display

# Function to play a speech file
def play_speech(file_path):
    """Load an audio file and show an autoplaying player for it.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.
    """
    # librosa returns the audio samples together with their sampling rate
    samples, sample_rate = librosa.load(file_path)

    # Build the player widget and display it in one step
    display(Audio(data=samples, rate=sample_rate, autoplay=True))
# Function to generate speech from a text prompt
def speak_prompt(speech_prompt, autoplay=True,
  speech_file_path="speech.mp3"):
    """Synthesize speech for a text prompt, save it, and optionally play it.

    Relies on a module-level ``client`` that is not defined in this section.

    Args:
        speech_prompt: The text to turn into speech.
        autoplay: When true, play the saved file with ``play_speech``.
        speech_file_path: Where the synthesized audio is written.
    """
    # Generate speech from the text prompt using TTS. The streaming-response
    # form is used because calling stream_to_file on the plain response is
    # deprecated in the OpenAI 1.x SDK.
    with client.audio.speech.with_streaming_response.create(
      model="tts-1",
      voice="alloy",
      input=speech_prompt,
    ) as response:
        # Save the synthesized speech to the specified path
        response.stream_to_file(speech_file_path)

    # Sometimes you want to play the speech automatically,
    # sometimes you do not
    if autoplay:
        # Play the synthesized speech
        play_speech(speech_file_path)
# Play the initial response based on the situational prompt.
# The model is asked to put "====" between the situation and the initial
# response, but its output is not guaranteed to contain it; fall back to
# the whole text rather than failing with an IndexError.
response_parts = full_response.split('====')
if len(response_parts) > 1:
    initial_situation = response_parts[1].strip()
else:
    initial_situation = full_response.strip()
speak_prompt(initial_situation)
# Function to transcribe speech from an audio file
def transcript_speech(speech_filename="my_speech.wav"):
    """Transcribe an audio file and return the recognized text.

    Relies on a module-level ``client`` that is not defined in this section.

    Args:
        speech_filename: Path of the audio file, opened in binary mode.

    Returns:
        The ``text`` attribute of the transcription result.
    """
    # Fixed request settings: Whisper model, JSON response, English audio
    request_options = {
        "model": "whisper-1",
        "response_format": "json",
        "language": "en",
    }
    # The file only needs to stay open for the duration of the request
    with open(speech_filename, "rb") as audio_file:
        result = client.audio.transcriptions.create(
          file=audio_file,
          **request_options
        )
    return result.text
# Run speech recognition on the recorded sample
transcripted_text = transcript_speech(
    speech_filename="audio/cappuccino.m4a"
)

# Show what was recognized
print(transcripted_text)
# Function to create a conversation history
def creating_conversation_history(history, added_response):
    """Append a quoted response to the history after a "====" line.

    Args:
        history: The conversation so far.
        added_response: The new response to add.

    Returns:
        The history, a "====" separator line, the new response wrapped in
        single quotes, and a trailing newline.
    """
    separator = "===="
    quoted_response = f"'{added_response}'"
    return f"{history}\n{separator}\n{quoted_response}\n"
# Start the history from the model's full response and add the
# transcribed reply to it, then show the result
history = creating_conversation_history(
    history=full_response,
    added_response=transcripted_text,
)
print(history)
# Function to generate a conversation based on the conversation history
def generate_conversation_from_history(history):
    """Ask the chat model to continue a conversation from its history.

    Relies on a module-level ``client`` that is not defined in this section.

    Args:
        history: The conversation so far; it is appended to the
            instructions on a new line.

    Returns:
        The message content of the first choice in the chat completion.
    """
    # Instructions that precede the history in the user message
    prompt = """Continue conversation from a person based on this
      conversation history and end it with '\n====\n'.
      Limit it to max 3 sentences.
      This is the history:"""
    response = client.chat.completions.create(
      model="gpt-4o",
      messages=[
        # The system message must be one string literal; a quoted string
        # cannot span physical lines without explicit joining.
        {"role": "system",
         "content": "You are a creative writer. Very very creative."},
        {"role": "user", "content": f"{prompt}\n{history}"}
      ]
    )
    # Extract and return the generated conversation
    return response.choices[0].message.content
# Ask the model for the next turn of the dialogue and show it
conversation = generate_conversation_from_history(history=history)
print(conversation)
# Append the new turn to the running history, separated by a "====" line
combined_history = "\n====\n".join((history, conversation))

# Show the full dialogue so far
print(combined_history)
# Generate a scenery image based on the combined history.
# The concatenation is wrapped in parentheses so it can span two lines;
# an indented continuation line without them is an IndentationError.
dalle_prompt = (
    "Generate a scenery based on this conversation: "
    + combined_history
)
img = generate_situation_image(dalle_prompt)

# Display the generated image
display_image(img)
# Generate and play the prompt based on the new conversation
speak_prompt(conversation)
See forum comments
Cinema mode Download course materials from GitHub
Previous: Generating Situational Prompts & Images Next: Building the User Interface with Gradio