Dispatch API based on DBMS type

sbadithe · sbadithe · commit c6c609788a12 · 2022-07-18T12:19:51.000-07:00
diff --git a/featuretools-sql/__pycache__/connector.cpython-38.pyc b/featuretools-sql/__pycache__/connector.cpython-38.pyc
diff --git a/featuretools-sql/connector.py b/featuretools-sql/connector.py
@@ -1,10 +1,16 @@
-import warnings
-
+from collections import namedtuple 
 import connectorx as cx
 import pandas as pd
 
-
 class DBConnector:
+    Relationship = namedtuple('Relationship', ['referenced_table_name', 'referenced_column_name', 'table_name', 'col_name'])
+    
+    database_to_API = {
+        "postgres": "ConnectorX",
+        "mysql": "ConnectorX"
+    }
+    supported_databases = ["postgres", "mysql"] 
+
     def __init__(
         self, system_name: str, user: str, password: str, host: str, database: str
     ):
@@ -15,14 +21,21 @@ def __init__(
             "host": host,
             "database": database,
         }
+
+        #TODO: Password security 
         if None in [user, password, host, database]:
             raise ValueError("Cannot pass None as argument to DBConnector constructor")
-        self.connection_string = f"{system_name}://{user}:{password}@{host}/{database}"
-
+        if system_name not in DBConnector.supported_databases:  # validate the DBMS type, not the schema name
+            raise NotImplementedError(f"DBConnector does not currently support {system_name}")
+        self.connection_string = f"{system_name}://{user}:{password}@{host}/{database}" 
         self.relationships = []
         self.tables = []
         self.dataframes = dict()
 
+    @classmethod
+    def learn_supported_databases(cls) -> "list[str]":  # quoted: builtin generics are not subscriptable before Python 3.9
+        return cls.supported_databases
+
     def change_system_name(self, system_name: str):
         self.config["system_name"] = system_name
 
@@ -55,7 +68,6 @@ def get_primary_key_from_table(self, table: str) -> pd.DataFrame:
         df = self.run_query(
             f"SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '{db}' AND TABLE_NAME = '{table}' AND COLUMN_KEY = 'PRI';"
         )
-        warnings.warn("Cannot handle composite keys yet!")
         return df["COLUMN_NAME"]
 
     def populate_dataframes(self, debug=False):
@@ -79,6 +91,7 @@ def populate_dataframes(self, debug=False):
         return
 
     def populate_relationships(self, debug=False):
+        self.relationships = [] 
         query_str = f"SELECT TABLE_NAME, COLUMN_NAME, CONSTRAINT_NAME, REFERENCED_TABLE_NAME, REFERENCED_COLUMN_NAME FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE WHERE REFERENCED_TABLE_SCHEMA = '{self.config['database']}'"
         foreign_keys = self.run_query(query_str)
         for (
@@ -88,17 +101,17 @@ def populate_relationships(self, debug=False):
             referenced_table_name,
             referenced_column_name,
         ) in foreign_keys.values:
-            rel_tuple = (
+            r = DBConnector.Relationship(
                 referenced_table_name,
                 referenced_column_name,
                 table_name,
                 col_name,
             )
-            self.relationships.append(rel_tuple)
-        return
+            self.relationships.append(r)
+
 
     def run_query(self, query: str) -> pd.DataFrame:
         if not isinstance(query, str):
             raise ValueError(f"Query must be of string type, not {type(query)}")
-        df = cx.read_sql(self.connection_string, query)
-        return df
+        if DBConnector.database_to_API[self.config["system_name"]] == "ConnectorX":  # dispatch on DBMS type, not schema name
+            return cx.read_sql(self.connection_string, query)
diff --git a/featuretools-sql/example.ipynb b/featuretools-sql/example.ipynb
@@ -0,0 +1,223 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING:root:'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2022-07-15 12:00:25,286 featuretools - WARNING    While loading primitives via \"premium_primitives\" entry point, ignored primitive \"PartOfDay\" from \"premium_primitives.part_of_day\" because a primitive with that name already exists in \"featuretools.primitives.standard.datetime_transform_primitives\"\n"
+     ]
+    }
+   ],
+   "source": [
+    "from connector import DBConnector\n",
+    "from featuretools import EntitySet"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'\\nMany transactions can map to the same product \\n'"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "\"\"\"\n",
+    "+------------+--------------+\n",
+    "| product_id | product_name |\n",
+    "+------------+--------------+\n",
+    "|          1 | Car          |\n",
+    "|          2 | Truck        |\n",
+    "|          3 | Plane        |\n",
+    "+------------+--------------+\n",
+    "PRODUCTS TABLE \n",
+    "\"\"\"\n",
+    "\n",
+    "\"\"\"\n",
+    "+----------------+------------+\n",
+    "| transaction_id | product_id |\n",
+    "+----------------+------------+\n",
+    "|              0 |          1 |\n",
+    "|              1 |          1 |\n",
+    "|              3 |          1 |\n",
+    "|              4 |          2 |\n",
+    "|              2 |          3 |\n",
+    "|              5 |          3 |\n",
+    "+----------------+------------+\n",
+    "TRANSACTIONS TABLE \n",
+    "\"\"\"\n",
+    "\n",
+    "\"\"\"\n",
+    "Many transactions can map to the same product \n",
+    "\"\"\" "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "     TABLE_NAME\n",
+      "0      products\n",
+      "1  transactions\n"
+     ]
+    }
+   ],
+   "source": [
+    "config = dict()\n",
+    "config[\"system_name\"] = \"mysql\"\n",
+    "config[\"host\"] = \"127.0.0.1:3306\"\n",
+    "config[\"password\"] = \"harrypotter\"\n",
+    "config[\"user\"] = \"root\"\n",
+    "config[\"database\"] = \"dummy\"\n",
+    "\n",
+    "sql_connector = DBConnector(**config) \n",
+    "tables = sql_connector.all_tables()\n",
+    "print(f\"{type(tables)}\")\n",
+    "print(tables) "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'dict'>\n",
+      "products\n",
+      "   product_id product_name\n",
+      "0           1          Car\n",
+      "1           2        Truck\n",
+      "2           3        Plane\n",
+      "product_id\n",
+      "transactions\n",
+      "   transaction_id  product_id\n",
+      "0               0           1\n",
+      "1               1           1\n",
+      "2               3           1\n",
+      "3               4           2\n",
+      "4               2           3\n",
+      "5               5           3\n",
+      "transaction_id\n"
+     ]
+    }
+   ],
+   "source": [
+    "sql_connector.populate_dataframes(debug=False) \n",
+    "print(type(sql_connector.dataframes))\n",
+    "for name, df in sql_connector.dataframes.items(): \n",
+    "    print(name) \n",
+    "    print(df[0]) \n",
+    "    print(df[1])  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "referenced_table_name : products\n",
+      "referenced_column_name : product_id\n",
+      "table_name : transactions\n",
+      "col_name : product_id\n"
+     ]
+    }
+   ],
+   "source": [
+    "sql_connector.populate_relationships(debug=False) \n",
+    "for rel_tuple in sql_connector.relationships:   \n",
+    "    for field, val in rel_tuple._asdict().items(): \n",
+    "        print(f\"{field} : {val}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Entityset: es\n",
+      "  DataFrames:\n",
+      "    products [Rows: 3, Columns: 2]\n",
+      "    transactions [Rows: 6, Columns: 2]\n",
+      "  Relationships:\n",
+      "    transactions.product_id -> products.product_id\n"
+     ]
+    }
+   ],
+   "source": [
+    "es = EntitySet(\"es\", sql_connector.dataframes, sql_connector.relationships) \n",
+    "print(es) "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.8.12 64-bit ('venv_x86')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.12"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "3f6b062a214ec48d1657976024d6bc68979519d14a33afb6ad033fc2e4189514"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}