Skip to content

Latest commit

 

History

History
444 lines (325 loc) · 7.84 KB

File metadata and controls

444 lines (325 loc) · 7.84 KB

Speech Recognition and Translation Using Whisper

What is Whisper?

How to Get Started?

# Create an isolated Python 3.9 environment
# (mkvirtualenv is provided by virtualenvwrapper)
mkvirtualenv -p python3.9 whisper-learning
# Install a pinned release of the open-source Whisper package
pip install -U openai-whisper==20231117
# Whisper relies on the ffmpeg command-line tool.
# Install it with ONE of the package managers below, not all of them.
# Debian / Ubuntu
apt update
apt install ffmpeg
# Arch Linux
pacman -S ffmpeg
# macOS (Homebrew)
brew install ffmpeg
# Windows (Chocolatey)
choco install ffmpeg
# Windows (Scoop)
scoop install ffmpeg
# NOTE(review): presumably only needed when the Whisper install fails
# because setuptools_rust is missing - confirm before running it routinely
pip install setuptools-rust==1.8.1
# Create a folder
mkdir -p src/whisper/audio
# Download the audio file
# (the URL is split across lines; backslash-newline joins the pieces,
# so the continuation lines must not start with whitespace)
wget \
https://upload.wikimedia.org/\
wikipedia/commons/7/75/\
Winston_Churchill_-_Be_Ye_Men_of_Valour.ogg \
-O src/whisper/audio/Winston_Church.ogg
# Using the base model
whisper \
src/whisper/audio/Winston_Church.ogg \
--model base
# Using the medium model
whisper \
src/whisper/audio/Winston_Church.ogg \
--model medium

# Using the large model
whisper \
src/whisper/audio/Winston_Church.ogg \
--model large
[00:00.000 --> 00:07.680]  I speak to you for the first time as Prime Minister [..]
[00:08.320 --> 00:14.880]  of our empire, of our allies, and above all of the [..]
[00:16.640 --> 00:20.240]  A tremendous battle is raging in France and flanders [..]
[00:21.920 --> 00:27.920]  The Germans by a remarkable combination of air bombing [..]
[.....]
[.....]
[.....]
[03:16.400 --> 03:22.000]  of the specialized and mechanized forces of the enemy [..]
[03:22.640 --> 03:26.560]  and we know that very heavy losses have been inflicted [..]
[.....]
[.....]

Transcribe and Translate

# Download the second audio file
wget \
https://upload.wikimedia.org/\
wikipedia/commons/1/1a/\
Lenin_-_What_Is_Soviet_Power.ogg \
-O src/whisper/audio/Lenin.ogg
# Declare the spoken language as Russian and run the translate task
# instead of the default transcription
whisper \
src/whisper/audio/Lenin.ogg \
--language Russian \
--task translate
[00:00.000 --> 00:02.000]  What is Soviet power?
[00:02.000 --> 00:06.000]  What is the essence of this new power,
[00:06.000 --> 00:11.000]  which cannot be understood in most countries?
[00:11.000 --> 00:15.000]  The essence of it is attracting workers.
[00:15.000 --> 00:19.000]  In each country, more and more people stand up
[00:19.000 --> 00:22.000]  to the Prime Minister of the State,
[00:22.000 --> 00:25.000]  such as other rich or capitalists,
[00:25.000 --> 00:29.000]  and now for the first time control the state.
# Same translation run with --best_of 20
# NOTE(review): --best_of is understood to set how many candidates are
# sampled when decoding with a non-zero temperature - confirm via whisper -h
whisper \
src/whisper/audio/Lenin.ogg \
--language Russian \
--task translate \
--best_of 20
[00:00.000 --> 00:06.200]  what Soviet power is, what is the integrity of these new authorities,
[00:06.280 --> 00:11.140]  which cannot, or do not, successfully resolve in a government?
[00:11.540 --> 00:15.540]  The integrity of it attracts its workers,
[00:15.660 --> 00:19.260]  the Garze country is getting more and more,
[00:19.320 --> 00:25.580]  with the greater state control than other rich or capitalists.
[00:25.680 --> 00:29.020]  This is the first time the ..
whisper -h

Using Whisper SDK in Python

cat << EOF > src/whisper/whisper_test.py
"""Transcribe a local audio file with the open-source Whisper package.

Prints the recognised text on success. Exits with status 1 when either
the model cannot be loaded or the transcription fails.
"""
import sys

import whisper

# Adjust model size as needed
# (e.g., "tiny", "small", "medium", "large")
model_name = "base"

audio_file_path = "src/whisper/audio/Winston_Church.ogg"

try:
    # Load the model
    model = whisper.load_model(model_name)
except Exception as e:
    print(f"Error loading model '{model_name}': {e}")
    # sys.exit is the supported way to stop a script; the bare exit()
    # helper is added by the site module for interactive sessions and
    # is not guaranteed to exist.
    sys.exit(1)

try:
    # Transcribe the audio
    result = model.transcribe(audio_file_path)
except Exception as e:
    print(f"Error transcribing file '{audio_file_path}': {e}")
    # Report the failure through the exit status, same as a load failure.
    sys.exit(1)

print("The text in video:")
print(result["text"])
EOF
python src/whisper/whisper_test.py
cat << EOF > src/whisper/whisper_translate.py
"""Run Whisper on one audio file while forcing several language codes.

For every entry in target_languages the script prints the label followed
by the full result object returned by model.transcribe. Exits with
status 1 when the model cannot be loaded or a run fails.
"""
import sys

import whisper

# Adjust model size as needed
# (e.g., "tiny", "small", "medium", "large")
model_name = "medium"

audio_file_path = "src/whisper/audio/Winston_Church.ogg"

# (label printed before the result, language code handed to Whisper)
# ..add more pairs here to try other languages
target_languages = [
    ("Arabic", "ar"),
    ("Chinese", "zh"),
]

try:
    # Load the model
    model = whisper.load_model(model_name)
except Exception as e:
    print(f"Error loading model '{model_name}': {e}")
    # sys.exit instead of the interactive-only exit() helper.
    sys.exit(1)

try:
    for label, language_code in target_languages:
        # NOTE(review): Whisper documents the language option as the
        # language spoken in the audio, and its translate task targets
        # English only. Forcing another code on this recording is a
        # best-effort trick, so check the output quality before
        # relying on it.
        result = model.transcribe(
            audio_file_path,
            language=language_code
        )
        print(f"{label}:")
        print(result)
except Exception as e:
    print(f"Error transcribing audio '{audio_file_path}': {e}")
    # A failed run stops the remaining languages and is reported
    # through the exit status, same as a load failure.
    sys.exit(1)
EOF
python src/whisper/whisper_translate.py

Using OpenAI Speech to Text API

Transcription API

workon openaigptforpythondevelopers
"""Create the OpenAI client from credentials stored in src/.env.

The env file holds one KEY=VALUE pair per line. Every pair is copied into
os.environ, then API_KEY and ORG_ID are read back to build the client.

Raises:
    FileNotFoundError: if src/.env does not exist (path is relative to
        the current working directory).
    ValueError: if a non-blank, non-comment line contains no "=".
    KeyError: if API_KEY or ORG_ID is not defined after loading.
"""
import os
from openai import OpenAI

with open("src/.env") as env:
    for line in env:
        line = line.strip()
        # Blank lines and comment lines carry no setting; without this
        # guard a trailing empty line would break the unpacking below.
        if not line or line.startswith("#"):
            continue
        # Split on the first "=" only, so a value may itself contain "=".
        key, value = line.split("=", 1)
        os.environ[key.strip()] = value.strip()

client = OpenAI(
    api_key=os.environ['API_KEY'],
    organization=os.environ['ORG_ID']
)
API_KEY=sk-...
ORG_ID=org-...
cat << EOF > src/transcribe.py
"""Send the Lenin recording to the transcription endpoint and print the text."""
import os

# Authenticated client shared by the example scripts
from api import client

# Model used for the request
WHISPER_MODEL = "whisper-1"

# Absolute location of the recording to upload
SOURCE_AUDIO = os.path.abspath("src/whisper/audio/Lenin.ogg")

# Open the recording in binary mode and request its transcription
with open(SOURCE_AUDIO, "rb") as stream:
    response = client.audio.transcriptions.create(
        file=stream,
        model=WHISPER_MODEL,
    )

# Only the text attribute of the response is shown
print(response.text)
EOF
python src/transcribe.py

Translation API

cat << EOF > src/translate.py
"""Send the Lenin recording to the translation endpoint and print the text."""
import os

# Authenticated client shared by the example scripts
from api import client

# Model used for the request
WHISPER_MODEL = "whisper-1"

# Absolute location of the recording to upload
SOURCE_AUDIO = os.path.abspath("src/whisper/audio/Lenin.ogg")

# Open the recording in binary mode and request its translation
# (this script calls audio.translations, not audio.transcriptions)
with open(SOURCE_AUDIO, "rb") as stream:
    translation = client.audio.translations.create(
        file=stream,
        model=WHISPER_MODEL,
    )

# Only the text attribute of the response is shown
print(translation.text)
EOF
python src/translate.py

Improving Whisper Transcription

Cleaning the Audio

Using the Prompt Parameter

We need an exit. You're not far from Cypher. Cypher?
We need an exit. You're not far from Cipher. Cipher?
cat << EOF > src/transcribe_with_prompt.py
"""Transcribe the cypher clip while passing a prompt along with the audio."""
import os

# Authenticated client shared by the example scripts
from api import client

# Model used for the request
WHISPER_MODEL = "whisper-1"

# Text sent through the prompt parameter: the spelling wanted for the name
SPELLING_HINT = "Cypher"

# Absolute location of the clip to upload
CLIP_PATH = os.path.abspath("src/whisper/audio/cypher.mp3")

# Open the clip in binary mode and request its transcription,
# forwarding the hint as the prompt
with open(CLIP_PATH, "rb") as stream:
    response = client.audio.transcriptions.create(
        file=stream,
        model=WHISPER_MODEL,
        prompt=SPELLING_HINT,
    )

# Only the text attribute of the response is shown
print(response.text)
EOF
We need an exit. You're not far from Cypher. Cypher?
We need an exit. You're not far from Cipher. Cipher?

Post-Processing the Transcription

cat << EOF > src/post_process.py
"""Transcribe the cypher clip, then have a chat model correct the text.

Step 1 uploads the clip to the transcription endpoint. Step 2 sends the
resulting text, together with fixed correction instructions, to a chat
completion and prints the reply.
"""
import os

# Authenticated client shared by the example scripts
from api import client

# One name per model, since the script talks to two different endpoints
SPEECH_MODEL = "whisper-1"
CHAT_MODEL = "gpt-3.5-turbo"

# Absolute location of the clip to upload
CLIP_PATH = os.path.abspath("src/whisper/audio/cypher.mp3")

# System message for the correction pass. It is written as adjacent
# literals so the trailing space before each line break stays visible.
CORRECTION_INSTRUCTIONS = (
    'You are a movie specialist. \n'
    'Your role is to correct the transcription of \n'
    'the audio file. For example, the word "Cipher" \n'
    'should be "Cypher".'
)

# Step 1: open the clip in binary mode and request its transcription
with open(CLIP_PATH, "rb") as stream:
    raw_transcript = client.audio.transcriptions.create(
        file=stream,
        model=SPEECH_MODEL,
    )

# Step 2: the instructions go in as the system message,
# the raw transcript as the user message
conversation = [
    {"role": "system", "content": CORRECTION_INSTRUCTIONS},
    {"role": "user", "content": raw_transcript.text},
]
completion = client.chat.completions.create(
    model=CHAT_MODEL,
    messages=conversation,
)

# Show the content of the first returned choice
print(completion.choices[0].message.content)
EOF
python src/post_process.py