Skip to content

Latest commit

 

History

History
1467 lines (1221 loc) · 31.5 KB

File metadata and controls

1467 lines (1221 loc) · 31.5 KB

Using a Vector Database with OpenAI

Introduction

What is a Vector Database?

Example 1: Using Weaviate to Make Our Model More Context-Aware

# Fetch Docker's convenience installation script
curl --fail --silent --show-error --location \
    --output get-docker.sh https://get.docker.com

# Execute the downloaded installer
sh get-docker.sh
mkvirtualenv vector_db_example
# -p creates missing parents (such as src/) and does nothing when a
# directory already exists, so the commands are safe to re-run.
mkdir -p src/vector_db_example/app
mkdir -p src/vector_db_example/weaviate
# Write the image definition for the Flask application.
cat << EOF > src/vector_db_example/app/Dockerfile.app
FROM python:3.9-bookworm
WORKDIR /app
COPY requirements.txt requirements.txt
RUN pip install -r requirements.txt
COPY . .
EXPOSE 5000
# A Dockerfile instruction must sit on one logical line, so the exec-form
# array is not split across lines.
CMD ["flask", "--app", "app", "run", "--host=0.0.0.0", "--port=5000", "--debug", "--reload"]
EOF
# Write the image definition for the Weaviate server.
cat << EOF > src/vector_db_example/weaviate/Dockerfile.weaviate
FROM semitechnologies/weaviate:1.23.7
EXPOSE 8080
EXPOSE 50051
# A Dockerfile instruction must sit on one logical line, so the exec-form
# array is not split across lines.
CMD ["--host", "0.0.0.0", "--port", "8080", "--scheme", "http"]
EOF
# Write the Compose file that builds and wires the two services together.
cat << EOF > src/vector_db_example/docker-compose.yml
---
version: '3.9'

services:

  app:
    build:
      context: app
      dockerfile: Dockerfile.app
    env_file:
      - app/.env
    ports:
      - "5000:5000"
    volumes:
      - ./app:/app/
    # app.py contacts Weaviate while it is being imported, so the Weaviate
    # container has to be started first.
    depends_on:
      - weaviate

  weaviate:
    build:
      context: weaviate
      dockerfile: Dockerfile.weaviate
    env_file:
      - weaviate/.env
    ports:
      - "8080:8080"
      - "50051:50051"
EOF
# Replace the placeholder value with a real OpenAI API key.
export OPENAI_API_KEY=your-api-key
# The heredoc delimiter is unquoted, so the shell expands $OPENAI_API_KEY and
# the literal key value is written into the file.
cat << EOF > src/vector_db_example/app/.env
OPENAI_API_KEY=$OPENAI_API_KEY
EOF
# Same expansion here. Note that this file names the variable OPENAI_APIKEY
# (no underscore between API and KEY), unlike the app's OPENAI_API_KEY.
cat << EOF > src/vector_db_example/weaviate/.env
OPENAI_APIKEY=$OPENAI_API_KEY
# Sets the default number of objects to be returned in a query.
QUERY_DEFAULTS_LIMIT=25
# Allow users to interact with weaviate without auth
AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED=true
# Where should Weaviate Standalone store its data?
PERSISTENCE_DATA_PATH=/var/lib/weaviate
# Default vectorizer module to use
DEFAULT_VECTORIZER_MODULE=text2vec-openai
# Which modules to enable in the setup?
ENABLE_MODULES=text2vec-openai
# Hostname of the weaviate instance
CLUSTER_HOSTNAME=node1
EOF
cat << EOF > src/vector_db_example/app/requirements.txt
annotated-types==0.6.0
anyio==4.2.0
Authlib==1.3.0
blinker==1.7.0
certifi==2024.2.2
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
cryptography==42.0.2
distro==1.9.0
exceptiongroup==1.2.0
flask==3.0.2
grpcio==1.60.1
grpcio-health-checking==1.60.1
grpcio-tools==1.60.1
h11==0.14.0
httpcore==1.0.2
httpx==0.26.0
idna==3.6
importlib-metadata==7.0.1
itsdangerous==2.1.2
Jinja2==3.1.3
MarkupSafe==2.1.5
openai==1.11.1
protobuf==4.25.2
pycparser==2.21
pydantic==2.6.1
pydantic-core==2.16.2
requests==2.31.0
sniffio==1.3.0
tqdm==4.66.1
typing-extensions==4.9.0
urllib3==2.2.0
validators==0.22.0
weaviate-client==4.4.2
werkzeug==3.0.1
zipp==3.17.0
EOF
src/vector_db_example/
├── app
│   ├── app.py
│   ├── Dockerfile.app
│   ├── .env
│   └── requirements.txt
└── weaviate
    ├── Dockerfile.weaviate
    └── .env
# Step-by-step excerpts of the body of the /ask handler. They are not
# runnable on their own: they end with a return statement and use a
# "messages" variable that is not assigned in these excerpts.

# Read the question from the "q" query-string parameter
question = request.args.get("q")

# Wrap the question as a chat message with the "user" role
user_prompt = {
        "role": "user",
        "content": question        
    }
# Fetch stored messages that are semantically close to the question
context = weaviate_nearest_interactions(
    question, 
    weaviate_certainty,
    weaviate_limit,
)
# Fetch up to interactions_limit stored messages
latest_interactions = weaviate_latest_interactions(
    interactions_limit
)
# Concatenate both result lists
global_context = latest_interactions["data"] + context["data"]
# Get unique values
# (going through a set removes duplicates but does not keep the order)
global_context = [
    dict(t) for t in {
        tuple(d.items()) for d in global_context
        }
    ]
# Ask the chat model for a completion of at most 200 tokens
response = openai_client.chat.completions.create(
    model=model,
    messages=messages,
    max_tokens=200,
    temperature=1.2,
)

# Keep the text of the first choice, wrapped as an "assistant" message
content = response.choices[0].message.content.strip()
assistant_prompt = {
    "role": "assistant",
    "content": content
}

# Persist both sides of the exchange
data = [
    user_prompt,
    assistant_prompt
]

weaviate_save_data(
    data
)
# Send back the answer together with the context that was used
return {
    "response": assistant_prompt["content"],
    "global_context": global_context,
}
@app.route("/ask", methods=["GET"])
def ask():
    """Answer the question passed as ?q=... using stored chat messages as context.

    The question, the stored messages nearest to it and the messages returned
    by weaviate_latest_interactions are sent to the OpenAI chat model. The
    question and the answer are then saved back to Weaviate.

    Returns:
        A dict with the model answer ("response") and the context messages
        that were sent ("global_context"), or an error dict with status 400
        when the q parameter is missing or empty.
    """
    # Get the question from the user
    question = request.args.get("q")
    if not question:
        # Nothing to embed or to send to the model.
        return {"error": "Missing required query parameter 'q'"}, 400

    # Build the user prompt
    user_prompt = {"role": "user", "content": question}

    # Get the nearest context and the stored interactions from Weaviate
    context = weaviate_nearest_interactions(
        question,
        weaviate_certainty,
        weaviate_limit,
    )
    latest_interactions = weaviate_latest_interactions(interactions_limit)

    # A helper that failed may have returned None; treat that as no context.
    merged = (
        ((latest_interactions or {}).get("data") or [])
        + ((context or {}).get("data") or [])
    )

    # Drop duplicates while keeping first-seen order. A set would not
    # preserve order and would scramble the conversation sent to the model.
    unique_items = dict.fromkeys(tuple(message.items()) for message in merged)
    global_context = [dict(items) for items in unique_items]

    # Build the message list and send it to OpenAI
    messages = [system_prompt] + global_context + [user_prompt]
    response = openai_client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=200,
        temperature=1.2,
    )

    # The message content can be None, so fall back to an empty string.
    content = (response.choices[0].message.content or "").strip()

    # Save the user prompt and the answer in Weaviate
    assistant_prompt = {"role": "assistant", "content": content}
    weaviate_save_data([user_prompt, assistant_prompt])

    # Return the response to the user
    return {
        "response": assistant_prompt["content"],
        "global_context": global_context,
    }
def weaviate_nearest_interactions(query, certainty, limit):
    """Return the stored chat messages semantically closest to query.

    Args:
        query: Text used as the single near-text concept.
        certainty: Minimum certainty passed to the near-text filter.
        limit: Maximum number of objects to return.

    Returns:
        A dict {"data": [...]} holding the "role" and "content" of each match.
        When the query fails, the error is logged and {"data": []} is
        returned, so callers can always index ["data"].
    """
    try:
        # Get the nearest context from Weaviate
        result = (
            weaviate_client.query
            .get(
                class_name=weaviate_class_name,
                properties=["role", "content"],
            )
            .with_near_text({"concepts": [query], "certainty": certainty})
            .with_limit(limit)
            .do()
        )
        return {"data": result["data"]["Get"][weaviate_class_name] or []}
    except Exception as e:
        app.logger.error(f"Error while searching: {e}")
        # Returning nothing here would make the caller fail on ["data"].
        return {"data": []}

def weaviate_latest_interactions(limit):
    """Return up to limit stored chat messages.

    No sort order is requested from Weaviate, so despite the name the result
    is not guaranteed to be the most recent messages.

    Args:
        limit: Maximum number of objects to return.

    Returns:
        A dict {"data": [...]} holding the "role" and "content" of each
        object. When the query fails, the error is logged and {"data": []}
        is returned, so callers can always index ["data"].
    """
    try:
        result = (
            weaviate_client.query
            .get(
                class_name=weaviate_class_name,
                properties=["role", "content"],
            )
            .with_limit(limit)
            .do()
        )
        return {"data": result["data"]["Get"][weaviate_class_name] or []}
    except Exception as e:
        app.logger.error(f"Error while searching: {e}")
        # Returning nothing here would make the caller fail on ["data"].
        return {"data": []}
# Weaviate schema with a single class that stores one chat message per
# object: the message text ("content") and who said it ("role").
schema = {
    "classes": [
        {
            "class": weaviate_class_name,
            "description": "A class to store chat messages",
            "properties": [
                {
                    "name": "content",
                    "description": "The content of the chat message",
                    "dataType": ["text"],
                },
                {
                    "name": "role",
                    "description": "The role of the message",
                    # NOTE(review): "string" may be deprecated in favor of
                    # "text" in recent Weaviate versions - confirm.
                    "dataType": ["string"],
                },
            ],
        }
    ]
}

def weaviate_create_schema():
    """Register the module-level schema in Weaviate.

    Failures are logged rather than raised.
    """
    try:
        weaviate_client.schema.create(schema)
    except Exception as err:
        app.logger.error(f"Failed to create schema: {err}")
    else:
        app.logger.info("Schema successfully created.")
def weaviate_delete_data():
    """Delete the chat-message class from Weaviate.

    Returns:
        None on success. On failure the error is logged and a tuple of
        (error dict, 500) is returned.
    """
    try:
        weaviate_client.schema.delete_class(class_name=weaviate_class_name)
    except Exception as err:
        app.logger.error(f"Error while deleting class: {err}")
        return {"error in weaviate_reset": str(err)}, 500
    else:
        app.logger.info("Data successfully reset.")
weaviate_class_name = "ChatMessage"
# Quote the delimiter so the shell writes the Python source verbatim instead
# of expanding $ or backticks that may appear in it.
cat << 'EOF' > src/vector_db_example/app/app.py
import weaviate, os
from flask import Flask, request
from openai import OpenAI

app = Flask(__name__)

# The OpenAI key is read from the environment of the container.
openai_api_key = os.getenv("OPENAI_API_KEY")

# System message placed in front of every conversation sent to the model.
system_prompt = {
    "role": "system",
    "content": "You are a helpful assistant",
}
model = "gpt-3.5-turbo"
weaviate_class_name = "ChatMessage"
# Maximum number of objects requested by the near-text query and by the
# plain listing query, respectively.
weaviate_limit = 10
interactions_limit = 10
# Minimum certainty passed to the near-text query.
weaviate_certainty = 0.5

openai_client = OpenAI(api_key=openai_api_key)

weaviate_client = weaviate.Client(
    url="http://weaviate:8080",
    # The OpenAI key is an extra request header, not a Weaviate credential,
    # so it belongs in additional_headers rather than auth_client_secret.
    additional_headers={"X-OpenAI-Api-Key": openai_api_key},
)

# Weaviate schema with a single class that stores one chat message per
# object: the message text ("content") and who said it ("role").
schema = {
    "classes": [
        {
            "class": weaviate_class_name,
            "description": 
                "A class to store chat messages",
            "properties": [
                {
                    "name": "content",
                    "description": 
                        "The content of the chat message",
                    "dataType": ["text"],
                },
                {
                    "name": "role",
                    "description": 
                        "The role of the message",
                    # NOTE(review): "string" may be deprecated in favor of
                    # "text" in recent Weaviate versions - confirm.
                    "dataType": ["string"],
                },
            ],
        }
    ]
}

def weaviate_create_schema():
    """Register the module-level schema in Weaviate.

    Failures are logged rather than raised.
    """
    try:
        weaviate_client.schema.create(schema)
    except Exception as err:
        app.logger.error(f"Failed to create schema: {err}")
    else:
        app.logger.info("Schema successfully created.")

def weaviate_delete_data():
    """Delete the chat-message class from Weaviate.

    Returns:
        None on success. On failure the error is logged and a tuple of
        (error dict, 500) is returned.
    """
    try:
        weaviate_client.schema.delete_class(class_name=weaviate_class_name)
    except Exception as err:
        app.logger.error(f"Error while deleting class: {err}")
        return {"error in weaviate_reset": str(err)}, 500
    else:
        app.logger.info("Data successfully reset.")


# Runs whenever this module is imported: delete the chat-message class, then
# create the schema again, so every (re)start begins without stored messages.
weaviate_delete_data()
weaviate_create_schema()

def weaviate_nearest_interactions(query, certainty, limit):
    """Return the stored chat messages semantically closest to query.

    Args:
        query: Text used as the single near-text concept.
        certainty: Minimum certainty passed to the near-text filter.
        limit: Maximum number of objects to return.

    Returns:
        A dict {"data": [...]} holding the "role" and "content" of each match.
        When the query fails, the error is logged and {"data": []} is
        returned, so callers can always index ["data"].
    """
    try:
        result = (
            weaviate_client.query
            .get(
                class_name=weaviate_class_name,
                properties=["role", "content"],
            )
            .with_near_text({"concepts": [query], "certainty": certainty})
            .with_limit(limit)
            .do()
        )
        return {"data": result["data"]["Get"][weaviate_class_name] or []}
    except Exception as e:
        app.logger.error(f"Error while searching: {e}")
        # Returning nothing here would make the caller fail on ["data"].
        return {"data": []}

def weaviate_latest_interactions(limit):
    """Return up to limit stored chat messages.

    No sort order is requested from Weaviate, so despite the name the result
    is not guaranteed to be the most recent messages.

    Args:
        limit: Maximum number of objects to return.

    Returns:
        A dict {"data": [...]} holding the "role" and "content" of each
        object. When the query fails, the error is logged and {"data": []}
        is returned, so callers can always index ["data"].
    """
    try:
        result = (
            weaviate_client.query
            .get(
                class_name=weaviate_class_name,
                properties=["role", "content"],
            )
            .with_limit(limit)
            .do()
        )
        return {"data": result["data"]["Get"][weaviate_class_name] or []}
    except Exception as e:
        app.logger.error(f"Error while searching: {e}")
        # Returning nothing here would make the caller fail on ["data"].
        return {"data": []}

def weaviate_save_data(data):
    """Store chat messages in Weaviate through the batch API.

    Args:
        data: Iterable of dicts, each with a "role" and a "content" key.
    """
    weaviate_client.batch.configure(batch_size=100)
    with weaviate_client.batch as batch:
        for message in data:
            # Copy only the two stored fields; extra keys are ignored.
            batch.add_data_object(
                {
                    "role": message["role"],
                    "content": message["content"],
                },
                weaviate_class_name,
            )

@app.route("/ask", methods=["GET"])
def ask():
    """Answer the question passed as ?q=... using stored chat messages as context.

    The question, the stored messages nearest to it and the messages returned
    by weaviate_latest_interactions are sent to the OpenAI chat model. The
    question and the answer are then saved back to Weaviate.

    Returns:
        A dict with the model answer ("response") and the context messages
        that were sent ("global_context"), or an error dict with status 400
        when the q parameter is missing or empty.
    """
    question = request.args.get("q")
    if not question:
        # Nothing to embed or to send to the model.
        return {"error": "Missing required query parameter 'q'"}, 400

    user_prompt = {"role": "user", "content": question}

    context = weaviate_nearest_interactions(
        question,
        weaviate_certainty,
        weaviate_limit,
    )
    latest_interactions = weaviate_latest_interactions(interactions_limit)

    # A helper that failed may have returned None; treat that as no context.
    merged = (
        ((latest_interactions or {}).get("data") or [])
        + ((context or {}).get("data") or [])
    )

    # Drop duplicates while keeping first-seen order. A set would not
    # preserve order and would scramble the conversation sent to the model.
    unique_items = dict.fromkeys(tuple(message.items()) for message in merged)
    global_context = [dict(items) for items in unique_items]

    messages = [system_prompt] + global_context + [user_prompt]
    response = openai_client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=200,
        temperature=1.2,
    )

    # The message content can be None, so fall back to an empty string.
    content = (response.choices[0].message.content or "").strip()

    # Persist both sides of the exchange for later requests.
    assistant_prompt = {"role": "assistant", "content": content}
    weaviate_save_data([user_prompt, assistant_prompt])

    return {
        "response": assistant_prompt["content"],
        "global_context": global_context,
    }
EOF
@app.route("/data", methods=["GET"])
def weaviate_get_all_data():
    """List the stored chat messages (role and content) as JSON.

    Returns:
        A dict {"data": [...]} on success. On failure the error is logged
        and a tuple of (error dict, 500) is returned.
    """
    wanted = ["role", "content"]
    try:
        query = weaviate_client.query.get(
            class_name=weaviate_class_name,
            properties=wanted,
        )
        payload = query.do()
        return {"data": payload["data"]["Get"][weaviate_class_name]}
    except Exception as err:
        app.logger.error(f"Error while getting data: {err}")
        return {"error in weaviate_data": str(err)}, 500
cd src/vector_db_example
# The Docker installation script used above ships Compose as the
# "docker compose" plugin rather than a standalone docker-compose binary.
docker compose up --build
# URLs are quoted so the shell never treats "?" as a glob pattern.
# I have a cat
curl "http://127.0.0.1:5000/ask?q=I%20have%20a%20cat"
# I have a dog
curl "http://127.0.0.1:5000/ask?q=I%20have%20a%20dog"
# How many pets do I have?
curl "http://127.0.0.1:5000/ask?q=How%20many%20pets%20do%20I%20have%3F"
{
  "global_context": [
    {
      "content": "I have a dog",
      "role": "user"
    },
    {
      "content": 
        "That's wonderful! Cats make great companions. "
        "Is there anything specific you need help with "
        "regarding your cat?",
      "role": "assistant"
    },
    {
      "content": 
        "That's great! Dogs are wonderful pets too. "
        "Is there anything specific you need help "
        "with regarding your dog?",
      "role": "assistant"
    },
    {
      "content": "I have a cat",
      "role": "user"
    }
  ],
  "response": 
    "Based on your previous message, you mentioned "
    "having a dog. If you have a cat in addition to "
    "a dog, then you would have two pets."
}

Example 2: Using Weaviate and OpenAI in Semantic Search

mkvirtualenv vector_db_semantic_search
# -p creates missing parents (such as src/) and does nothing when a
# directory already exists, so the commands are safe to re-run.
mkdir -p src/vector_db_semantic_search/app/data
mkdir -p src/vector_db_semantic_search/weaviate
# Weaviate class definition for the imported articles. A definition like this
# only takes effect once it is passed to the schema API (for example
# schema.create_class).
# NOTE(review): the import code also writes a "url" property that is not
# declared here - confirm whether it should be listed.
article_class = {
    "class": weaviate_class_name,
    "description": 
        "An article from the Simple English Wikipedia data set",
    "vectorizer": "text2vec-openai",
    "moduleConfig": {
        # Match how OpenAI created the embeddings 
        # for the `content` (`text`) field
        "text2vec-openai": {
            "model": "ada",
            "modelVersion": "002",
            "type": "text",
            "vectorizeClassName": False
        }
    },
    "properties": [
        {
            "name": "title",
            "description": "The title of the article",
            "dataType": ["text"],
            # Don't vectorize the title
            "moduleConfig": {"text2vec-openai": {"skip": True}}
        },
        {
            "name": "content",
            "description": "The content of the article",
            "dataType": ["text"],
        }
    ]
}
def weaviate_import_data(csv_path="data/data.csv", max_rows=100):
    """Load articles and their stored vectors from a CSV file into Weaviate.

    Args:
        csv_path: CSV file with id, url, title, text and content_vector
            columns.
        max_rows: Maximum number of rows to import.
    """
    # Counter to show progress on the console
    progress_interval = 100
    imported = 0

    csv_chunks = pd.read_csv(
        csv_path,
        usecols=["id", "url", "title", "text", "content_vector"],
        # number of rows per chunk
        chunksize=100,
        # limit the number of rows to import
        nrows=max_rows,
    )

    # Configure batch
    weaviate_client.batch.configure(batch_size=100)

    with weaviate_client.batch as batch:
        for chunk in csv_chunks:
            # itertuples avoids building a Series per row, unlike iterrows.
            for row in chunk.itertuples(index=False):
                batch.add_data_object(
                    {
                        "title": row.title,
                        "content": row.text,
                        "url": row.url,
                    },
                    class_name=weaviate_class_name,
                    # Convert the vector from its CSV string form back to
                    # a Python list and store it as the object's embedding.
                    vector=ast.literal_eval(row.content_vector),
                )

                # Calculate and display progress
                imported += 1
                if imported % progress_interval == 0:
                    app.logger.debug(f"Imported {imported} articles...")
    app.logger.debug(f"Finished importing {imported} articles.")
@app.route("/ask", methods=["GET"])
def ask():
    """Run a semantic search for the q query parameter.

    Returns:
        A dict {"response": [...]} with the matching articles, or an error
        dict with status 400 when q is missing or empty.
    """
    question = request.args.get("q")
    if not question:
        # Without a query there is nothing to search for.
        return {"error": "Missing required query parameter 'q'"}, 400

    return {"response": weaviate_semantic_search(question)}
def weaviate_semantic_search(query, limit=2):
    """Return the articles nearest to query.

    Args:
        query: Free-text search string, used as the single near-text concept.
        limit: Maximum number of articles to return (2 by default).

    Returns:
        The list of result objects, each with "title", "content" and the
        "_additional" distance.
    """
    near_text = {"concepts": [query]}
    properties = ["title", "content", "_additional {distance}"]

    response = (
        weaviate_client.query
        .get(class_name=weaviate_class_name, properties=properties)
        .with_near_text(near_text)
        .with_limit(limit)
        .do()
    )
    return response["data"]["Get"][weaviate_class_name]
# Quote the delimiter: the Python source below contains backticks, which an
# unquoted heredoc would run as command substitutions.
cat << 'EOF' > src/vector_db_semantic_search/app/app.py
import weaviate, os, ast
from flask import Flask, request
from openai import OpenAI
import pandas as pd

app = Flask(__name__)

# The OpenAI key is read from the environment of the container.
openai_api_key = os.getenv("OPENAI_API_KEY")
model = "gpt-3.5-turbo"
weaviate_class_name = "Article"

openai_client = OpenAI(api_key=openai_api_key)

weaviate_client = weaviate.Client(
    url="http://weaviate:8080",
    # The OpenAI key is an extra request header, not a Weaviate credential,
    # so it belongs in additional_headers rather than auth_client_secret.
    additional_headers={"X-OpenAI-Api-Key": openai_api_key},
)

# Weaviate class definition for the imported articles. A definition like this
# only takes effect once it is passed to the schema API (for example
# schema.create_class).
# NOTE(review): the import code also writes a "url" property that is not
# declared here - confirm whether it should be listed.
article_class = {
    "class": weaviate_class_name,
    "description": 
        "An article from the Simple English Wikipedia data set",
    "vectorizer": "text2vec-openai",
    "moduleConfig": {
        # Match how OpenAI created the embeddings
        # for the content (text) field
        "text2vec-openai": {
            "model": "ada",
            "modelVersion": "002",
            "type": "text",
            "vectorizeClassName": False
        }
    },
    "properties": [
        {
            "name": "title",
            "description": "The title of the article",
            "dataType": ["text"],
            # Don't vectorize the title
            "moduleConfig": {"text2vec-openai": {"skip": True}}
        },
        {
            "name": "content",
            "description": "The content of the article",
            "dataType": ["text"],
        }
    ]
}

def weaviate_import_data(csv_path="data/data.csv", max_rows=100):
    """Load articles and their stored vectors from a CSV file into Weaviate.

    Args:
        csv_path: CSV file with id, url, title, text and content_vector
            columns.
        max_rows: Maximum number of rows to import.
    """
    progress_interval = 100
    imported = 0

    csv_chunks = pd.read_csv(
        csv_path,
        usecols=["id", "url", "title", "text", "content_vector"],
        # number of rows per chunk
        chunksize=100,
        # limit the number of rows to import
        nrows=max_rows,
    )

    # Configure batch
    weaviate_client.batch.configure(batch_size=100)

    with weaviate_client.batch as batch:
        for chunk in csv_chunks:
            # itertuples avoids building a Series per row, unlike iterrows.
            for row in chunk.itertuples(index=False):
                batch.add_data_object(
                    {
                        "title": row.title,
                        "content": row.text,
                        "url": row.url,
                    },
                    class_name=weaviate_class_name,
                    # Convert the vector from its CSV string form back to
                    # a Python list and store it as the object's embedding.
                    vector=ast.literal_eval(row.content_vector),
                )

                # Calculate and display progress
                imported += 1
                if imported % progress_interval == 0:
                    app.logger.debug(f"Imported {imported} articles...")
    app.logger.debug(f"Finished importing {imported} articles.")

def weaviate_semantic_search(query, limit=2):
    """Return the articles nearest to query.

    Args:
        query: Free-text search string, used as the single near-text concept.
        limit: Maximum number of articles to return (2 by default).

    Returns:
        The list of result objects, each with "title", "content" and the
        "_additional" distance.
    """
    near_text = {"concepts": [query]}
    properties = ["title", "content", "_additional {distance}"]

    response = (
        weaviate_client.query
        .get(class_name=weaviate_class_name, properties=properties)
        .with_near_text(near_text)
        .with_limit(limit)
        .do()
    )
    return response["data"]["Get"][weaviate_class_name]
    
# Runs at import time: wipe the existing schema, register the Article class
# so that its vectorizer settings are actually applied, then load the data.
weaviate_client.schema.delete_all()
weaviate_client.schema.create_class(article_class)
weaviate_import_data()

@app.route("/ask", methods=["GET"])
def ask():
    """Run a semantic search for the q query parameter.

    Returns:
        A dict {"response": [...]} with the matching articles, or an error
        dict with status 400 when q is missing or empty.
    """
    question = request.args.get("q")
    if not question:
        # Without a query there is nothing to search for.
        return {"error": "Missing required query parameter 'q'"}, 400

    return {"response": weaviate_semantic_search(question)}
EOF
cd src/vector_db_semantic_search
# The Docker installation script ships Compose as the "docker compose"
# plugin rather than a standalone docker-compose binary.
docker compose up --build
# URLs are quoted so the shell never treats "?" as a glob pattern, and the
# loopback address is used because 0.0.0.0 is a listen address, not a
# portable destination.
# search query: politics
curl "http://127.0.0.1:5000/ask?q=politics"
# search query: science
curl "http://127.0.0.1:5000/ask?q=science"
# search query: cats
curl "http://127.0.0.1:5000/ask?q=cats"

Example 3: Using Weaviate and OpenAI for Generative Search

# The redirect below fails unless the target directory exists.
mkdir -p src/vector_db_generative_search/weaviate
# The delimiter is left unquoted on purpose so $OPENAI_API_KEY is expanded.
# generative-openai is enabled in addition to the vectorizer module.
cat << EOF > src/vector_db_generative_search/weaviate/.env
OPENAI_APIKEY=$OPENAI_API_KEY
QUERY_DEFAULTS_LIMIT=25
AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED=true
PERSISTENCE_DATA_PATH=/var/lib/weaviate
DEFAULT_VECTORIZER_MODULE=text2vec-openai
ENABLE_MODULES=text2vec-openai,generative-openai
CLUSTER_HOSTNAME=node1
EOF
prompt = """Rewrite in the style of Shakespeare:
{content}
"""

def weaviate_semantic_search(query, prompt, limit=1):
    """Find the articles nearest to query and run a generative prompt on each.

    Args:
        query: Free-text search string, used as the single near-text concept.
        prompt: Prompt passed to the generative step as single_prompt.
            Presumably {content} refers to the content property of each
            result - confirm against the generative module documentation.
        limit: Maximum number of articles to return (1 by default).

    Returns:
        The list of result objects returned by Weaviate for the class.
    """
    near_text = {"concepts": [query]}
    properties = ["title", "content", "_additional {distance}"]

    response = (
        weaviate_client.query
        .get(class_name=weaviate_class_name, properties=properties)
        .with_generate(single_prompt=prompt)
        .with_near_text(near_text)
        .with_limit(limit)
        .do()
    )
    return response["data"]["Get"][weaviate_class_name]
prompt = """Translate to Welsh:
{content}
"""
# Create the app directory (with the data folder read by app.py) and the
# weaviate directory.
mkdir -p src/vector_db_generative_search/app/data
mkdir -p src/vector_db_generative_search/weaviate
# The remaining files (Dockerfiles, docker-compose.yml, app/.env,
# requirements.txt, data/data.csv) are presumably set up as in Example 2.
cat << 'EOT' > src/vector_db_generative_search/app/app.py
import weaviate, os, ast
from flask import Flask, request
from openai import OpenAI
import pandas as pd

app = Flask(__name__)

# The OpenAI key is read from the environment of the container.
openai_api_key = os.getenv("OPENAI_API_KEY")
model = "gpt-3.5-turbo"
weaviate_class_name = "Article"

openai_client = OpenAI(api_key=openai_api_key)

weaviate_client = weaviate.Client(
    url="http://weaviate:8080",
    # The OpenAI key is an extra request header, not a Weaviate credential,
    # so it belongs in additional_headers rather than auth_client_secret.
    additional_headers={"X-OpenAI-Api-Key": openai_api_key},
)

# Weaviate class definition for the imported articles. A definition like this
# only takes effect once it is passed to the schema API (for example
# schema.create_class).
# NOTE(review): the import code also writes a "url" property that is not
# declared here - confirm whether it should be listed.
article_class = {
    "class": weaviate_class_name,
    "description": 
        "An article from the Simple English Wikipedia data set",
    "vectorizer": "text2vec-openai",
    "moduleConfig": {
        # Match how OpenAI created the embeddings 
        # for the `content` (`text`) field
        "text2vec-openai": {
            "model": "ada",
            "modelVersion": "002",
            "type": "text",
            "vectorizeClassName": False
        }
    },
    "properties": [
        {
            "name": "title",
            "description": "The title of the article",
            "dataType": ["text"],
            # Don't vectorize the title
            "moduleConfig": {"text2vec-openai": {"skip": True}}
        },
        {
            "name": "content",
            "description": "The content of the article",
            "dataType": ["text"],
        }
    ]
}

def weaviate_import_data(csv_path="data/data.csv", max_rows=100):
    """Load articles and their stored vectors from a CSV file into Weaviate.

    Args:
        csv_path: CSV file with id, url, title, text and content_vector
            columns.
        max_rows: Maximum number of rows to import.
    """
    progress_interval = 100
    imported = 0

    csv_chunks = pd.read_csv(
        csv_path,
        usecols=["id", "url", "title", "text", "content_vector"],
        # number of rows per chunk
        chunksize=100,
        # limit the number of rows to import
        nrows=max_rows,
    )

    # Configure batch
    weaviate_client.batch.configure(batch_size=100)

    with weaviate_client.batch as batch:
        for chunk in csv_chunks:
            # itertuples avoids building a Series per row, unlike iterrows.
            for row in chunk.itertuples(index=False):
                batch.add_data_object(
                    {
                        "title": row.title,
                        "content": row.text,
                        "url": row.url,
                    },
                    class_name=weaviate_class_name,
                    # Convert the vector from its CSV string form back to
                    # a Python list and store it as the object's embedding.
                    vector=ast.literal_eval(row.content_vector),
                )

                # Calculate and display progress
                imported += 1
                if imported % progress_interval == 0:
                    app.logger.debug(f"Imported {imported} articles...")
    app.logger.debug(f"Finished importing {imported} articles.")

def weaviate_semantic_search(query, prompt, limit=1):
    """Find the articles nearest to query and run a generative prompt on each.

    Args:
        query: Free-text search string, used as the single near-text concept.
        prompt: Prompt passed to the generative step as single_prompt.
            Presumably {content} refers to the content property of each
            result - confirm against the generative module documentation.
        limit: Maximum number of articles to return (1 by default).

    Returns:
        The list of result objects returned by Weaviate for the class.
    """
    near_text = {"concepts": [query]}
    properties = ["title", "content", "_additional {distance}"]

    response = (
        weaviate_client.query
        .get(class_name=weaviate_class_name, properties=properties)
        .with_generate(single_prompt=prompt)
        .with_near_text(near_text)
        .with_limit(limit)
        .do()
    )
    return response["data"]["Get"][weaviate_class_name]
    
# Runs at import time: wipe the existing schema, register the Article class
# so that its vectorizer settings are actually applied, then load the data.
weaviate_client.schema.delete_all()
weaviate_client.schema.create_class(article_class)
weaviate_import_data()

@app.route("/ask", methods=["GET"])
def ask():
    """Search for the q query parameter and rewrite each match with a prompt.

    Returns:
        A dict {"response": [...]} with the search results, or an error dict
        with status 400 when q is missing or empty.
    """
    question = request.args.get("q")
    if not question:
        # Without a query there is nothing to search for.
        return {"error": "Missing required query parameter 'q'"}, 400

    prompt = """Rewrite in the style of Shakespeare:
    {content}
    """

    context = weaviate_semantic_search(question, prompt)

    return {"response": context}
EOT
cd src/vector_db_generative_search
# The Docker installation script ships Compose as the "docker compose"
# plugin rather than a standalone docker-compose binary.
docker compose up --build
# URLs are quoted so the shell never treats "?" as a glob pattern, and the
# loopback address is used because 0.0.0.0 is a listen address, not a
# portable destination.
# search query: politics
curl "http://127.0.0.1:5000/ask?q=politics"
# search query: science
curl "http://127.0.0.1:5000/ask?q=science"
# search query: cats
curl "http://127.0.0.1:5000/ask?q=cats"
    response = weaviate_client.query.get(
        class_name=weaviate_class_name,
        properties=properties,
    ).with_generate(
        grouped_task=prompt
    ).with_near_text(
        nearText
    ).with_limit(
        limit
    ).do()
prompt = """Extract the list of topics discussed in these articles:
{content}
"""
cat << 'EOT' > src/vector_db_generative_search/app/app.py
import weaviate, os, ast
from flask import Flask, request
from openai import OpenAI
import pandas as pd

app = Flask(__name__)

# The OpenAI key is read from the environment of the container.
openai_api_key = os.getenv("OPENAI_API_KEY")
model = "gpt-3.5-turbo"
weaviate_class_name = "Article"

openai_client = OpenAI(api_key=openai_api_key)

weaviate_client = weaviate.Client(
    url="http://weaviate:8080",
    # The OpenAI key is an extra request header, not a Weaviate credential,
    # so it belongs in additional_headers rather than auth_client_secret.
    additional_headers={"X-OpenAI-Api-Key": openai_api_key},
)

# Weaviate class definition for the imported articles. A definition like this
# only takes effect once it is passed to the schema API (for example
# schema.create_class).
# NOTE(review): the import code also writes a "url" property that is not
# declared here - confirm whether it should be listed.
article_class = {
    "class": weaviate_class_name,
    "description": 
        "An article from the Simple English Wikipedia data set",
    "vectorizer": "text2vec-openai",
    "moduleConfig": {
        # Match how OpenAI created the embeddings 
        # for the `content` (`text`) field
        "text2vec-openai": {
            "model": "ada",
            "modelVersion": "002",
            "type": "text",
            "vectorizeClassName": False
        }
    },
    "properties": [
        {
            "name": "title",
            "description": "The title of the article",
            "dataType": ["text"],
            # Don't vectorize the title
            "moduleConfig": {"text2vec-openai": {"skip": True}}
        },
        {
            "name": "content",
            "description": "The content of the article",
            "dataType": ["text"],
        }
    ]
}

def weaviate_import_data(csv_path="data/data.csv", max_rows=100):
    """Load articles and their stored vectors from a CSV file into Weaviate.

    Args:
        csv_path: CSV file with id, url, title, text and content_vector
            columns.
        max_rows: Maximum number of rows to import.
    """
    progress_interval = 100
    imported = 0

    csv_chunks = pd.read_csv(
        csv_path,
        usecols=["id", "url", "title", "text", "content_vector"],
        # number of rows per chunk
        chunksize=100,
        # limit the number of rows to import
        nrows=max_rows,
    )

    # Configure batch
    weaviate_client.batch.configure(batch_size=100)

    with weaviate_client.batch as batch:
        for chunk in csv_chunks:
            # itertuples avoids building a Series per row, unlike iterrows.
            for row in chunk.itertuples(index=False):
                batch.add_data_object(
                    {
                        "title": row.title,
                        "content": row.text,
                        "url": row.url,
                    },
                    class_name=weaviate_class_name,
                    # Convert the vector from its CSV string form back to
                    # a Python list and store it as the object's embedding.
                    vector=ast.literal_eval(row.content_vector),
                )

                # Calculate and display progress
                imported += 1
                if imported % progress_interval == 0:
                    app.logger.debug(f"Imported {imported} articles...")
    app.logger.debug(f"Finished importing {imported} articles.")

def weaviate_semantic_search(query, prompt, limit=3):
    """Find the articles nearest to query and run one prompt over all of them.

    Args:
        query: Free-text search string, used as the single near-text concept.
        prompt: Prompt passed to the generative step as grouped_task.
            NOTE(review): a grouped task may not interpolate placeholders
            such as {content} - confirm against the generative module
            documentation.
        limit: Maximum number of articles to return (3 by default).

    Returns:
        The list of result objects returned by Weaviate for the class.
    """
    near_text = {"concepts": [query]}
    properties = ["title", "content", "_additional {distance}"]

    response = (
        weaviate_client.query
        .get(class_name=weaviate_class_name, properties=properties)
        .with_generate(grouped_task=prompt)
        .with_near_text(near_text)
        .with_limit(limit)
        .do()
    )
    return response["data"]["Get"][weaviate_class_name]
    
# Runs at import time: wipe the existing schema, register the Article class
# so that its vectorizer settings are actually applied, then load the data.
weaviate_client.schema.delete_all()
weaviate_client.schema.create_class(article_class)
weaviate_import_data()

@app.route("/ask", methods=["GET"])
def ask():
    """Search for the q query parameter and summarize the topics of the matches.

    Returns:
        A dict {"response": [...]} with the search results, or an error dict
        with status 400 when q is missing or empty.
    """
    question = request.args.get("q")
    if not question:
        # Without a query there is nothing to search for.
        return {"error": "Missing required query parameter 'q'"}, 400

    prompt = """Extract the list of topics discussed in these articles:
    {content}
    """

    context = weaviate_semantic_search(question, prompt)

    return {"response": context}
EOT
{
  "response": [
    {
      "_additional": {
        "distance": 0.17569739,
        "generate": {
          "error": null,
          "groupedResult": "The list of topics discussed in these articles are:\n\n1. Animals (zoology, palaeontology, cellular respiration, metabolism, cell membranes)\n2. Plants (multicellular eukaryotic organisms)\n3. Grouping animals..."
        }
      },
      "content": "Animals (or Metazoa) are living creatures with many cells...",
      "title": "Animal"
    },
    {
      "_additional": {
        "distance": 0.21414834,
        "generate": null
      },
      "content": "A browser is a name given to any animal, usually a herbivorous mammal ..",
      "title": "Browser"
    },
    {
      "_additional": {
        "distance": 0.22182149,
        "generate": null
      },
      "content": "Being is also a present tense part of to be..",
      "title": "Being"
    }
  ]
}
prompt = """You are a helpful assistant. You are having a discussion with a user. This is the context of the discussion:
{content}
"""