Skip to content

Latest commit

 

History

History
1014 lines (828 loc) · 22.8 KB

File metadata and controls

1014 lines (828 loc) · 22.8 KB

Advanced Embedding Examples

Predicting Your Preferred Coffee

Review of coffee blend A: "Delicate, sweetly spice-toned. The finish consolidates to notes of date and hazelnut with undertones of cedar."

Review of coffee blend B:
Bold, richly aromatic with a hint of citrus. Dark chocolate, toasted walnut, orange blossom, cedar, and brown sugar in aroma and cup. Brightly sweet with a vibrant acidity; full, velvety mouthfeel. The finish centers on dark chocolate and walnut with a cedar undertone.

Review of coffee blend C:
Lively, tangy with a fruity essence. Raspberry, macadamia, gardenia, bamboo, molasses in aroma and cup. Sweet-tart structure with a brisk acidity; light, silky mouthfeel. The finish is a delightful blend of raspberry and macadamia, complemented by a bamboo note.

Review of coffee blend D:
Robust, earthy with a hint of smokiness. Black currant, hazelnut, hibiscus, oak, treacle in aroma and cup. Deeply sweet with a low acidity; smooth, thick mouthfeel. The finish combines black currant and hazelnut with an oak backdrop.

Review of coffee blend E:
Subtle, delicately spiced with a sweet tone. Date, almond, orchid, cedar, maple syrup in aroma and cup. Sweet-toned structure with gentle, rounded acidity; silky, satiny mouthfeel. The finish is a smooth interplay of date and almond with a cedar undercurrent.
# Switch to the project's virtual environment
# (workon is presumably the virtualenvwrapper command - confirm for your setup)
workon openaigptforpythondevelopers
# Install NLTK, pinned to the exact release 3.8.1
pip install nltk==3.8.1
cat << EOF > src/api.py
import os
from openai import OpenAI

# Load KEY=VALUE pairs from src/.env into the process environment.
with open("src/.env", encoding="utf-8") as env:
    for line_number, line in enumerate(env, start=1):
        line = line.strip()
        # Blank lines and comment lines carry no setting; skip them
        # instead of failing to unpack them.
        if not line or line.startswith("#"):
            continue
        # Split on the first "=" only, so a value that itself contains
        # "=" (for example base64 padding) is kept whole.
        key, separator, value = line.partition("=")
        if not separator:
            # Same exception type as before, but with a usable message.
            # The line content is left out because it may be a secret.
            raise ValueError(
                f"src/.env line {line_number}: expected KEY=VALUE"
            )
        os.environ[key.strip()] = value.strip()

# Shared OpenAI client. The lookups raise KeyError when API_KEY or
# ORG_ID was not provided by the environment or by src/.env.
client = OpenAI(
    api_key=os.environ['API_KEY'],
    organization=os.environ['ORG_ID']
)

def get_embedding(text, model):
    """Return the embedding vector of text computed by the given model.

    Newlines in text are replaced with spaces first. The text is sent
    as a one-item batch and the embedding of that single item is
    returned.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding
EOF
cat << EOF > src/utils.py
import pandas as pd
import numpy as np
import nltk

def cosine_similarity(a, b):
    """Return the cosine similarity between the vectors a and b.

    Args:
        a: First vector (any sequence or array numpy can consume).
        b: Second vector, of the same length as a.

    Returns:
        The dot product of a and b divided by the product of their
        Euclidean norms. When either vector has zero norm the cosine is
        undefined; 0.0 is returned instead of dividing by zero, so a NaN
        never reaches callers that sort or rank by this score.
    """
    numerator = np.dot(a, b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return numerator / denominator

def download_nltk_data():
    """Make sure the NLTK resources used for preprocessing are installed.

    Looks up the 'punkt' tokenizer models and then the 'stopwords'
    corpus; a resource that nltk cannot find is downloaded.
    """
    required = (
        ('tokenizers/punkt', 'punkt'),
        ('corpora/stopwords', 'stopwords'),
    )
    for resource_path, package in required:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package)

def preprocess_text(text):
    """Reduce raw text to a space-separated string of word stems.

    Steps, in order: tokenize with NLTK, lower-case each token, drop
    tokens that are not purely alphabetic (punctuation, numbers), drop
    English stop words, then replace each remaining word with its
    Porter stem.
    """
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    from nltk.tokenize import word_tokenize

    raw_tokens = word_tokenize(text)
    english_stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()

    stems = []
    for token in raw_tokens:
        lowered = token.lower()
        # Skip non-alphabetic tokens and stop words
        if not lowered.isalpha() or lowered in english_stop_words:
            continue
        stems.append(porter.stem(lowered))

    return ' '.join(stems)
EOF
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Download the NLTK data needed below
nltk.download('wordnet')
nltk.download('omw-1.4')

# Create the stemmer and the lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Sample words to run through both processes
words = ['running', 'runner', 'jumps', 'easily', 'better']

# Stemming process
stemmed_words = list(map(stemmer.stem, words))

# Part-of-speech (POS) tag to use for each example word.
# Any word not listed here is lemmatized as a noun ('n').
pos_by_word = {
    'better': 'a',   # adjective
    'running': 'v',  # verb
    'jumps': 'v',    # verb
    'easily': 'r',   # adverb
}

# Lemmatization process with an explicit POS tag per word
lemmatized_words = [
    lemmatizer.lemmatize(word, pos=pos_by_word.get(word, 'n'))
    for word in words
]

# Print results
print("Original: ", words)
print("Stemmed: ", stemmed_words)
print("Lemmatized: ", lemmatized_words)
Original: ['running', 'runner', 'jumps', 'easily', 'better']
Stemmed: ['run', 'runner', 'jump', 'easili', 'better']
Lemmatized: ['run', 'runner', 'jump', 'easily', 'good']
import os

import numpy as np
import pandas as pd

from api import get_embedding
from utils import (
    cosine_similarity,
    download_nltk_data,
    preprocess_text
)

# Make sure the tokenizer and stop-word data needed by preprocess_text
# are installed before any review is processed
download_nltk_data()

# Read user input
input_coffee_name = input("Enter a coffee name: ")
dataset_file_path = os.path.join(
    os.path.dirname(__file__),
    'data',
    'simplified_coffee.csv'
)
# Load the CSV file into a Pandas DataFrame
# (only the first 50 rows for this example)
df = pd.read_csv(
    dataset_file_path,
    nrows=50
)
# Get the index of the input coffee name.
# This is checked before any embedding is requested, so an unknown
# name costs no API calls. read_csv is called without index_col, so
# the index labels are the row positions 0..n-1 and can be used to
# index the plain lists built below.
matching_rows = df.index[df['name'] == input_coffee_name]
if len(matching_rows) == 0:
    print("Please enter a valid coffee name.")
    # Same effect as exit(), without depending on the site module
    raise SystemExit
input_coffee_index = matching_rows[0]
# Preprocess the review text
df['preprocessed_review'] = df['review'].apply(
    preprocess_text
)
# The model to use
model = "text-embedding-ada-002"
# Get the embeddings for each review
review_embeddings = [
    get_embedding(review, model=model)
    for review in df['preprocessed_review']
]
# Calculate the cosine similarity between
# the input coffee's review and all other reviews
input_review_embedding = review_embeddings[input_coffee_index]
similarities = [
    cosine_similarity(input_review_embedding, review_embedding)
    for review_embedding in review_embeddings
]
# Get the indices of the five most similar reviews, in ascending
# order of similarity. The input coffee is removed by index instead of
# assuming it sorts last: another coffee with an identical review
# would tie with it at the top of the ranking.
ranked_indices = [
    index for index in np.argsort(similarities)
    if index != input_coffee_index
]
most_similar_indices = ranked_indices[-5:]
# Get the names of the most similar coffees
similar_coffee_names = df.iloc[most_similar_indices]['name'].tolist()
# Print the results
print(
    "The most similar coffees to "
    f"{input_coffee_name} are:"
)
for coffee_name in similar_coffee_names:
    print(coffee_name)
cat << EOF > src/app.py
import os
import pandas as pd
import numpy as np
from api import get_embedding
from utils import (
    cosine_similarity, 
    download_nltk_data, 
    preprocess_text
)

# Download necessary NLTK data
download_nltk_data()

dataset_file_path = os.path.join(
    os.path.dirname(__file__),
    'data',
    'simplified_coffee.csv'
)

# Read user input
input_coffee_name = input("Enter a coffee name: ")

# Load the CSV file into a Pandas DataFrame
# (only the first 50 rows for this example)
df = pd.read_csv(
    dataset_file_path,
    nrows=50
)

# Get the index of the input coffee name.
# This is checked before any embedding is requested, so an unknown
# name costs no API calls. read_csv is called without index_col, so
# the index labels are the row positions 0..n-1 and can be used to
# index the plain lists built below.
matching_rows = df.index[df['name'] == input_coffee_name]
if len(matching_rows) == 0:
    print("Please enter a valid coffee name.")
    # Same effect as exit(), without depending on the site module
    raise SystemExit
input_coffee_index = matching_rows[0]

# Preprocess the review text
df['preprocessed_review'] = df['review'].apply(
    preprocess_text
)

# The model to use
model = "text-embedding-ada-002"
# Get the embeddings for each review
review_embeddings = [
    get_embedding(review, model=model)
    for review in df['preprocessed_review']
]

# Calculate the cosine similarity between
# the input coffee's review and all other reviews
input_review_embedding = review_embeddings[input_coffee_index]
similarities = [
    cosine_similarity(input_review_embedding, review_embedding)
    for review_embedding in review_embeddings
]

# Get the indices of the five most similar reviews, in ascending
# order of similarity. The input coffee is removed by index instead of
# assuming it sorts last: another coffee with an identical review
# would tie with it at the top of the ranking.
ranked_indices = [
    index for index in np.argsort(similarities)
    if index != input_coffee_index
]
most_similar_indices = ranked_indices[-5:]

# Get the names of the most similar coffees
similar_coffee_names = df.iloc[most_similar_indices]['name'].tolist()

# Print the results (the first literal ends with a space so the
# coffee name is not glued to the word "to")
print(
    "The most similar coffees to "
    f"{input_coffee_name} are:"
)
for coffee_name in similar_coffee_names:
    print(coffee_name)
EOF
Enter a coffee name: Organic Ethiopia Kirite

The most similar coffees to Organic Ethiopia Kirite are:

El Peñon Nicaragua
Colombia David Gomez 100% Caturra
Panama Auromar Estate Geisha Peaberry
Ethiopia Yirgacheffe Natural G1
Ethiopia Shakiso Mormora

Creating a "Fuzzier" Search

# get the index of the input coffee name
# Search for coffee names that contain the input text, ignoring case.
# regex=False matches the input literally: by default pandas would
# interpret it as a regular expression, so characters such as "(" or
# "+" typed by the user would raise or match the wrong rows.
# na=False treats a row with a missing name as a non-match.
name_matches = df['name'].str.contains(
    input_coffee_name,
    case=False,
    regex=False,
    na=False
)
try:
    input_coffee_index = df[name_matches].index[0]
except IndexError:
    # an empty selection has no first element: nothing matched
    print(
        "Sorry, we don't have that coffee name in "
        "our database. Please try again."
    )
    # Same effect as exit(), without depending on the site module
    raise SystemExit
# input_coffee_index is an index label, so the row is read with .loc
print(
    "Found a coffee name that looks like "
    f"{df.loc[input_coffee_index, 'name']}. "
    "Using this coffee name instead."
)
# get the indexes of every coffee whose name looks like the input
# regex=False matches the input literally instead of as a regular
# expression; na=False treats a row with a missing name as a non-match.
name_matches = df['name'].str.contains(
    input_coffee_name,
    case=False,
    regex=False,
    na=False
)
input_coffee_indexes = df[name_matches].index
# Selecting rows does not raise when nothing matches, it just yields
# an empty index, so the "not found" case has to be tested explicitly.
if input_coffee_indexes.empty:
    print(
        "Sorry, we couldn't find any coffee "
        "with that name."
    )
    # Same effect as exit(), without depending on the site module
    raise SystemExit
# get the index of the input coffee name
# An exact match is used when there is one. No catch-all handler is
# kept: it could only fire for errors unrelated to the name (for
# example a missing 'name' column) and reported them as an unknown
# coffee.
exact_matches = df.index[df['name'] == input_coffee_name]
if len(exact_matches) > 0:
    input_coffee_index = exact_matches[0]
else:
    # no exact match: fall back to the name whose embedding is
    # closest to the embedding of what the user typed
    print(
        "Searching for a similar coffee name..."
    )
    # get the embeddings for each name
    name_embeddings = [
        get_embedding(name, model=model)
        for name in df['name']
    ]
    if not name_embeddings:
        # an empty dataset leaves nothing to compare against
        print(
            "Sorry, we don't have that coffee name "
            "in our database. Please try again."
        )
        # Same effect as exit(), without depending on the site module
        raise SystemExit
    # perform a cosine similarity search
    # on the input coffee name
    input_coffee_embedding = get_embedding(
        input_coffee_name,
        model=model
    )
    _similarities = [
        cosine_similarity(input_coffee_embedding, name_embedding)
        for name_embedding in name_embeddings
    ]
    # position of the best-scoring name
    input_coffee_index = _similarities.index(
        max(_similarities)
    )
cat << EOF > src/app.py
import os
import pandas as pd
import numpy as np
from api import get_embedding
from utils import (
    cosine_similarity, 
    download_nltk_data, 
    preprocess_text
)
# Download necessary NLTK data
download_nltk_data()

dataset_file_path = os.path.join(
    os.path.dirname(__file__),
    'data',
    'simplified_coffee.csv'
)

# Read user input
input_coffee_name = input("Enter a coffee name: ")

# Load the CSV file into a Pandas DataFrame
# (only the first 50 rows for this example).
# read_csv is called without index_col, so the index labels are the
# row positions 0..n-1 and can be used to index the lists built below.
df = pd.read_csv(
    dataset_file_path,
    nrows=50
)

# Preprocess the review text
df['preprocessed_review'] = df['review'].apply(
    preprocess_text
)

# The model to use
model = "text-embedding-ada-002"
# Get the embeddings for each review
review_embeddings = [
    get_embedding(review, model=model)
    for review in df['preprocessed_review']
]

# get the index of the input coffee name
# An exact match is used when there is one. No catch-all handler is
# kept: it could only fire for errors unrelated to the name (for
# example a missing 'name' column) and reported them as an unknown
# coffee.
exact_matches = df.index[df['name'] == input_coffee_name]
if len(exact_matches) > 0:
    input_coffee_index = exact_matches[0]
else:
    # no exact match: fall back to the name whose embedding is
    # closest to the embedding of what the user typed
    print(
        "Searching for a similar coffee name..."
    )
    # get the embeddings for each name
    name_embeddings = [
        get_embedding(name, model=model)
        for name in df['name']
    ]
    if not name_embeddings:
        # an empty dataset leaves nothing to compare against
        print(
            "Sorry, we don't have that coffee name "
            "in our database. Please try again."
        )
        # Same effect as exit(), without depending on the site module
        raise SystemExit
    # perform a cosine similarity search on the input coffee name
    input_coffee_embedding = get_embedding(
        input_coffee_name,
        model=model
    )
    _similarities = [
        cosine_similarity(input_coffee_embedding, name_embedding)
        for name_embedding in name_embeddings
    ]
    # position of the best-scoring name
    input_coffee_index = _similarities.index(
        max(_similarities)
    )

# Calculate the cosine similarity between
# the input coffee's review and all other reviews
input_review_embedding = review_embeddings[input_coffee_index]
similarities = [
    cosine_similarity(input_review_embedding, review_embedding)
    for review_embedding in review_embeddings
]

# Get the indices of the five most similar reviews, in ascending
# order of similarity. The selected coffee is removed by index instead
# of assuming it sorts last: another coffee with an identical review
# would tie with it at the top of the ranking.
ranked_indices = [
    index for index in np.argsort(similarities)
    if index != input_coffee_index
]
most_similar_indices = ranked_indices[-5:]

# Get the names of the most similar coffees
similar_coffee_names = df.iloc[most_similar_indices]['name'].tolist()

# Print the results
print(
    "The most similar coffees to "
    f"{input_coffee_name} are:"
)
for coffee_name in similar_coffee_names:
    print(coffee_name)
EOF
Enter a coffee name: Ethiopian Kirite
Searching for a similar coffee name...

The most similar coffees to "Ethiopian Kirite" are:

El Peñon Nicaragua
Colombia David Gomez 100% Caturra
Panama Auromar Estate Geisha Peaberry
Ethiopia Yirgacheffe Natural G1
Ethiopia Shakiso Mormora

Predicting News Category: Zero-Shot Classification with Embeddings

from api import get_embedding
from utils import cosine_similarity

# Candidate labels for the zero-shot classifier: classify_sentence
# embeds each of these and returns the one closest to the sentence.
categories = [
    'U.S. NEWS', 
    'COMEDY', 
    'PARENTING', 
    'WORLD NEWS', 
    'CULTURE & ARTS', 
    'TECH', 
    'SPORTS'
    ]
# Embeddings of the category labels, keyed by (model, category).
# The labels are the same for every sentence, so each one is embedded
# once and reused instead of being requested again on every call.
_category_embedding_cache = {}


def _get_category_embedding(category, model):
    """Return the embedding of a category label, computing it only once."""
    key = (model, category)
    if key not in _category_embedding_cache:
        _category_embedding_cache[key] = get_embedding(
            category,
            model=model
        )
    return _category_embedding_cache[key]


# Define a function to classify a sentence
def classify_sentence(sentence, model):
    """Return the category whose label embedding is closest to the sentence.

    Args:
        sentence: Text to classify.
        model: Name of the embedding model passed to get_embedding.

    Returns:
        The entry of the module-level categories list with the highest
        cosine similarity to the sentence embedding. If several
        categories tie, the one listed first wins.

    Raises:
        ValueError: If categories is empty.
    """
    # Get the embedding of the sentence
    sentence_embedding = get_embedding(
        sentence,
        model=model
    )
    # Calculate the similarity score
    # between the sentence and each category
    similarity_scores = {
        category: cosine_similarity(
            sentence_embedding,
            _get_category_embedding(category, model)
        )
        for category in categories
    }
    # Return the category with the highest
    # similarity score
    return max(
        similarity_scores,
        key=similarity_scores.get
    )
# Sample headlines to classify
sentences = [
    "1 dead and 3 injured in El Paso, Texas, mall shooting",
    "Director Owen Kline Calls Funny Pages His ‘Self-Critical’ Debut",
    "15 spring break ideas for families that want to get away",
    "The US is preparing to send more troops to the Middle East",
    "Bruce Willis' 'condition has progressed' "
    "to frontotemporal dementia, his family says",
    "Get an inside look at Universal’s new Super Nintendo World",
    "Barcelona 2-2 Manchester United: Marcus Rashford shines but "
    "Raphinha salvages draw for hosts",
    "Chicago bulls win the NBA championship",
    "The new iPhone 12 is now available",
    "Scientists discover a new dinosaur species",
    "The new coronavirus vaccine is now available",
    "The new Star Wars movie is now available",
    "Amazon stock hits a new record high",
]

model = "text-embedding-ada-002"

# Print the first 50 characters of each headline next to its
# predicted category, followed by an empty line
for sentence in sentences:
    category = classify_sentence(sentence, model=model)
    print(f"'{sentence[:50]}..' => {category}", end="\n\n")
cat << EOF > src/app.py
from api import get_embedding
from utils import cosine_similarity

# Candidate labels for the zero-shot classifier: classify_sentence
# embeds each of these and returns the one closest to the sentence.
categories = [
    'U.S. NEWS', 
    'COMEDY', 
    'PARENTING', 
    'WORLD NEWS', 
    'CULTURE & ARTS', 
    'TECH', 
    'SPORTS'
    ]

# Embeddings of the category labels, keyed by (model, category).
# The labels are the same for every sentence, so each one is embedded
# once and reused instead of being requested again on every call.
_category_embedding_cache = {}


def _get_category_embedding(category, model):
    """Return the embedding of a category label, computing it only once."""
    key = (model, category)
    if key not in _category_embedding_cache:
        _category_embedding_cache[key] = get_embedding(
            category,
            model=model
        )
    return _category_embedding_cache[key]


# Define a function to classify a sentence
def classify_sentence(sentence, model):
    """Return the category whose label embedding is closest to the sentence.

    Args:
        sentence: Text to classify.
        model: Name of the embedding model passed to get_embedding.

    Returns:
        The entry of the module-level categories list with the highest
        cosine similarity to the sentence embedding. If several
        categories tie, the one listed first wins.

    Raises:
        ValueError: If categories is empty.
    """
    # Get the embedding of the sentence
    sentence_embedding = get_embedding(
        sentence,
        model=model
    )
    # Calculate the similarity score
    # between the sentence and each category
    similarity_scores = {
        category: cosine_similarity(
            sentence_embedding,
            _get_category_embedding(category, model)
        )
        for category in categories
    }
    # Return the category with the highest
    # similarity score
    return max(
        similarity_scores,
        key=similarity_scores.get
    )

# Sample headlines to classify
sentences = [
    "1 dead and 3 injured in El Paso, Texas, mall shooting",
    "Director Owen Kline Calls Funny Pages His ‘Self-Critical’ Debut",
    "15 spring break ideas for families that want to get away",
    "The US is preparing to send more troops to the Middle East",
    "Bruce Willis' 'condition has progressed' "
    "to frontotemporal dementia, his family says",
    "Get an inside look at Universal’s new Super Nintendo World",
    "Barcelona 2-2 Manchester United: Marcus Rashford shines but "
    "Raphinha salvages draw for hosts",
    "Chicago bulls win the NBA championship",
    "The new iPhone 12 is now available",
    "Scientists discover a new dinosaur species",
    "The new coronavirus vaccine is now available",
    "The new Star Wars movie is now available",
    "Amazon stock hits a new record high",
]

model = "text-embedding-ada-002"

# Print the first 50 characters of each headline next to its
# predicted category, followed by an empty line
for sentence in sentences:
    category = classify_sentence(sentence, model=model)
    print(f"'{sentence[:50]}..' => {category}", end="\n\n")

EOF
'1 dead and 3 injured in El Paso..' category is => WORLD NEWS

'Director Owen Kline Calls Funny..' category is => COMEDY

'15 spring break ideas for families..' category is => PARENTING

'The US is preparing to send more troops..' category is => WORLD NEWS

'Bruce Willis' 'condition has progressed'.. category is => WORLD NEWS

'Get an inside look at Universal’s new..' category is => WORLD NEWS

'Barcelona 2-2 Manchester United: Marcus..' category is => SPORTS

'Chicago bulls win the NBA championship..' category is => SPORTS

'The new iPhone 12 is now available..' category is => TECH

'Scientists discover a new dinosaur..' category is => WORLD NEWS

'The new coronavirus vaccine is now..' category is => WORLD NEWS

'The new Star Wars movie is now..' category is => WORLD NEWS

'Amazon stock hits a new record..' category is => WORLD NEWS

Evaluating the Accuracy of a Zero-Shot Classifier

cat << EOF > src/extract_categories.py
# json is needed to parse each record of the dataset
import json

# Collect every distinct category label found in the dataset.
categories = set()
# The file holds one JSON object per line. JSON text is UTF-8, so the
# encoding is stated explicitly instead of depending on the locale.
with open('src/data/news.json', 'r', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines, which json.loads would reject
        if not line.strip():
            continue
        data = json.loads(line)
        categories.add(data['category'])
categories = list(categories)
EOF
def evaluate_precision(categories):
    """Classify the first 20 dataset headlines and score the predictions.

    Each headline is classified with classify_sentence and the outcome
    is printed next to the real category. Returns the value of
    precision_score computed with micro averaging over the given
    categories.
    """
    model = "text-embedding-ada-002"

    # Load the dataset (first 20 records only)
    sample = pd.read_json(
        "src/data/news.json",
        lines=True
    ).head(20)

    y_true, y_pred = [], []

    # Classify each headline and report whether the prediction matched
    for _, record in sample.iterrows():
        expected = record['category']
        predicted = classify_sentence(
            record['headline'],
            model=model
        )

        y_true.append(expected)
        y_pred.append(predicted)

        if expected == predicted:
            verdict = "😀 Correct prediction: "
        else:
            verdict = "😏 Incorrect prediction: "
        print(
            f"{verdict}"
            f"{record['headline'][:50]}...\n"
            f"Real: {expected[:20]}\n"
            f"Predicted: {predicted[:20]}"
        )

    # Calculate the precision score
    return precision_score(
        y_true,
        y_pred,
        average='micro',
        labels=categories
    )
cat << EOF > src/app.py
from api import get_embedding
from utils import cosine_similarity
import pandas as pd
from sklearn.metrics import precision_score
import json

# Find all categories (unique values) in the dataset.
# The file holds one JSON object per line. JSON text is UTF-8, so the
# encoding is stated explicitly instead of depending on the locale.
with open('src/data/news.json', 'r', encoding='utf-8') as file:
    categories = {
        json.loads(line)['category']
        for line in file
        # blank lines are skipped because json.loads rejects them
        if line.strip()
    }

categories = list(categories)

# Embeddings of the category labels, keyed by (model, category).
# The labels are the same for every sentence, so each one is embedded
# once and reused instead of being requested again on every call.
_category_embedding_cache = {}


def _get_category_embedding(category, model):
    """Return the embedding of a category label, computing it only once."""
    key = (model, category)
    if key not in _category_embedding_cache:
        _category_embedding_cache[key] = get_embedding(
            category,
            model=model
        )
    return _category_embedding_cache[key]


# Define a function to classify a sentence
def classify_sentence(sentence, model):
    """Return the category whose label embedding is closest to the sentence.

    Args:
        sentence: Text to classify.
        model: Name of the embedding model passed to get_embedding.

    Returns:
        The entry of the module-level categories list with the highest
        cosine similarity to the sentence embedding. If several
        categories tie, the one listed first wins.

    Raises:
        ValueError: If categories is empty.
    """
    # Get the embedding of the sentence
    sentence_embedding = get_embedding(
        sentence,
        model=model
    )
    # Calculate the similarity score
    # between the sentence and each category
    similarity_scores = {
        category: cosine_similarity(
            sentence_embedding,
            _get_category_embedding(category, model)
        )
        for category in categories
    }
    # Return the category with the highest
    # similarity score
    return max(
        similarity_scores,
        key=similarity_scores.get
    )

def evaluate_precision(categories):
    """Classify the first 20 dataset headlines and score the predictions.

    Each headline is classified with classify_sentence and the outcome
    is printed next to the real category. Returns the value of
    precision_score computed with micro averaging over the given
    categories.
    """
    model = "text-embedding-ada-002"

    # Load the dataset (first 20 records only)
    sample = pd.read_json(
        "src/data/news.json",
        lines=True
    ).head(20)

    y_true, y_pred = [], []

    # Classify each headline and report whether the prediction matched
    for _, record in sample.iterrows():
        expected = record['category']
        predicted = classify_sentence(
            record['headline'],
            model=model
        )

        y_true.append(expected)
        y_pred.append(predicted)

        if expected == predicted:
            verdict = "😀 Correct prediction: "
        else:
            verdict = "😏 Incorrect prediction: "
        print(
            f"{verdict}"
            f"{record['headline'][:50]}...\n"
            f"Real: {expected[:20]}\n"
            f"Predicted: {predicted[:20]}"
        )

    # Calculate the precision score
    return precision_score(
        y_true,
        y_pred,
        average='micro',
        labels=categories
    )

# Evaluate the precision of the classifier over the categories
# collected from the dataset, then print the resulting score
precision = evaluate_precision(categories)
print(f"Precision: {precision}")
EOF

Precision in Zero-Shot Classifier Applications: Examples