Skip to content

Commit c6c6097

Browse files
committed
Dispatches api based on dbms type
1 parent cc852e3 commit c6c6097

3 files changed

Lines changed: 247 additions & 11 deletions

File tree

28 Bytes
Binary file not shown.

featuretools-sql/connector.py

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,16 @@
1-
import warnings
2-
1+
from collections import namedtuple
32
import connectorx as cx
43
import pandas as pd
54

6-
75
class DBConnector:
6+
Relationship = namedtuple('Relationship', ['referenced_table_name', 'referenced_column_name', 'table_name', 'col_name'])
7+
8+
database_to_API = {
9+
"postgres": "ConnectorX",
10+
"mysql": "ConnectorX"
11+
}
12+
supported_databases = ["postgres", "mysql"]
13+
814
def __init__(
915
self, system_name: str, user: str, password: str, host: str, database: str
1016
):
@@ -15,14 +21,21 @@ def __init__(
1521
"host": host,
1622
"database": database,
1723
}
24+
25+
#TODO: Password security
1826
if None in [user, password, host, database]:
1927
raise ValueError("Cannot pass None as argument to DBConnector constructor")
20-
self.connection_string = f"{system_name}://{user}:{password}@{host}/{database}"
21-
28+
if database not in DBConnector.supported_databases:
29+
raise NotImplementedError(f"DBConnector does not currently support {database}")
30+
self.connection_string = f"{system_name}://{user}:{password}@{host}/{database}"
2231
self.relationships = []
2332
self.tables = []
2433
self.dataframes = dict()
2534

35+
@classmethod
36+
def learn_supported_databases(cls) -> list[str]:
37+
return cls.supported_databases
38+
2639
def change_system_name(self, system_name: str):
2740
self.config["system_name"] = system_name
2841

@@ -55,7 +68,6 @@ def get_primary_key_from_table(self, table: str) -> pd.DataFrame:
5568
df = self.run_query(
5669
f"SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '{db}' AND TABLE_NAME = '{table}' AND COLUMN_KEY = 'PRI';"
5770
)
58-
warnings.warn("Cannot handle composite keys yet!")
5971
return df["COLUMN_NAME"]
6072

6173
def populate_dataframes(self, debug=False):
@@ -79,6 +91,7 @@ def populate_dataframes(self, debug=False):
7991
return
8092

8193
def populate_relationships(self, debug=False):
94+
self.relationships = []
8295
query_str = f"SELECT TABLE_NAME, COLUMN_NAME, CONSTRAINT_NAME, REFERENCED_TABLE_NAME, REFERENCED_COLUMN_NAME FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE WHERE REFERENCED_TABLE_SCHEMA = '{self.config['database']}'"
8396
foreign_keys = self.run_query(query_str)
8497
for (
@@ -88,17 +101,17 @@ def populate_relationships(self, debug=False):
88101
referenced_table_name,
89102
referenced_column_name,
90103
) in foreign_keys.values:
91-
rel_tuple = (
104+
r = DBConnector.Relationship(
92105
referenced_table_name,
93106
referenced_column_name,
94107
table_name,
95108
col_name,
96109
)
97-
self.relationships.append(rel_tuple)
98-
return
110+
self.relationships.append(r)
111+
99112

100113
def run_query(self, query: str) -> pd.DataFrame:
101114
if not isinstance(query, str):
102115
raise ValueError(f"Query must be of string type, not {type(query)}")
103-
df = cx.read_sql(self.connection_string, query)
104-
return df
116+
if DBConnector.database_to_API[self.config["database"]] == "ConnectorX":
117+
return cx.read_sql(self.connection_string, query)

featuretools-sql/example.ipynb

Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,223 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [
8+
{
9+
"name": "stderr",
10+
"output_type": "stream",
11+
"text": [
12+
"WARNING:root:'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n"
13+
]
14+
},
15+
{
16+
"name": "stdout",
17+
"output_type": "stream",
18+
"text": [
19+
"2022-07-15 12:00:25,286 featuretools - WARNING While loading primitives via \"premium_primitives\" entry point, ignored primitive \"PartOfDay\" from \"premium_primitives.part_of_day\" because a primitive with that name already exists in \"featuretools.primitives.standard.datetime_transform_primitives\"\n"
20+
]
21+
}
22+
],
23+
"source": [
24+
"from connector import DBConnector\n",
25+
"from featuretools import EntitySet"
26+
]
27+
},
28+
{
29+
"cell_type": "code",
30+
"execution_count": 18,
31+
"metadata": {},
32+
"outputs": [
33+
{
34+
"data": {
35+
"text/plain": [
36+
"'\\nMany products can map to the same transaction \\n'"
37+
]
38+
},
39+
"execution_count": 18,
40+
"metadata": {},
41+
"output_type": "execute_result"
42+
}
43+
],
44+
"source": [
45+
"\"\"\"\n",
46+
"+------------+--------------+\n",
47+
"| product_id | product_name |\n",
48+
"+------------+--------------+\n",
49+
"| 1 | Car |\n",
50+
"| 2 | Truck |\n",
51+
"| 3 | Plane |\n",
52+
"+------------+--------------+\n",
53+
"PRODUCTS TABLE \n",
54+
"\"\"\"\n",
55+
"\n",
56+
"\"\"\"\n",
57+
"+----------------+------------+\n",
58+
"| transaction_id | product_id |\n",
59+
"+----------------+------------+\n",
60+
"| 0 | 1 |\n",
61+
"| 1 | 1 |\n",
62+
"| 3 | 1 |\n",
63+
"| 4 | 2 |\n",
64+
"| 2 | 3 |\n",
65+
"| 5 | 3 |\n",
66+
"+----------------+------------+\n",
67+
"TRANSACTIONS TABLE \n",
68+
"\"\"\"\n",
69+
"\n",
70+
"\"\"\"\n",
71+
"Many products can map to the same transaction \n",
72+
"\"\"\" "
73+
]
74+
},
75+
{
76+
"cell_type": "code",
77+
"execution_count": 6,
78+
"metadata": {},
79+
"outputs": [
80+
{
81+
"name": "stdout",
82+
"output_type": "stream",
83+
"text": [
84+
"<class 'pandas.core.frame.DataFrame'>\n",
85+
" TABLE_NAME\n",
86+
"0 products\n",
87+
"1 transactions\n"
88+
]
89+
}
90+
],
91+
"source": [
92+
"config = dict()\n",
93+
"config[\"system_name\"] = \"mysql\"\n",
94+
"config[\"host\"] = \"127.0.0.1:3306\"\n",
95+
"config[\"password\"] = \"harrypotter\"\n",
96+
"config[\"user\"] = \"root\"\n",
97+
"config[\"database\"] = \"dummy\"\n",
98+
"\n",
99+
"sql_connector = DBConnector(**config) \n",
100+
"tables = sql_connector.all_tables()\n",
101+
"print(f\"{type(tables)}\")\n",
102+
"print(tables) "
103+
]
104+
},
105+
{
106+
"cell_type": "code",
107+
"execution_count": 7,
108+
"metadata": {},
109+
"outputs": [
110+
{
111+
"name": "stdout",
112+
"output_type": "stream",
113+
"text": [
114+
"<class 'dict'>\n",
115+
"products\n",
116+
" product_id product_name\n",
117+
"0 1 Car\n",
118+
"1 2 Truck\n",
119+
"2 3 Plane\n",
120+
"product_id\n",
121+
"transactions\n",
122+
" transaction_id product_id\n",
123+
"0 0 1\n",
124+
"1 1 1\n",
125+
"2 3 1\n",
126+
"3 4 2\n",
127+
"4 2 3\n",
128+
"5 5 3\n",
129+
"transaction_id\n"
130+
]
131+
}
132+
],
133+
"source": [
134+
"sql_connector.populate_dataframes(debug=False) \n",
135+
"print(type(sql_connector.dataframes))\n",
136+
"for name, df in sql_connector.dataframes.items(): \n",
137+
" print(name) \n",
138+
" print(df[0]) \n",
139+
" print(df[1]) "
140+
]
141+
},
142+
{
143+
"cell_type": "code",
144+
"execution_count": 14,
145+
"metadata": {},
146+
"outputs": [
147+
{
148+
"name": "stdout",
149+
"output_type": "stream",
150+
"text": [
151+
"referenced_table_name : products\n",
152+
"referenced_column_name : product_id\n",
153+
"table_name : transactions\n",
154+
"col_name : product_id\n"
155+
]
156+
}
157+
],
158+
"source": [
159+
"sql_connector.populate_relationships(debug=False) \n",
160+
"for rel_tuple in sql_connector.relationships: \n",
161+
" for field, val in rel_tuple._asdict().items(): \n",
162+
" print(f\"{field} : {val}\")"
163+
]
164+
},
165+
{
166+
"cell_type": "code",
167+
"execution_count": 15,
168+
"metadata": {},
169+
"outputs": [
170+
{
171+
"name": "stdout",
172+
"output_type": "stream",
173+
"text": [
174+
"Entityset: es\n",
175+
" DataFrames:\n",
176+
" products [Rows: 3, Columns: 2]\n",
177+
" transactions [Rows: 6, Columns: 2]\n",
178+
" Relationships:\n",
179+
" transactions.product_id -> products.product_id\n"
180+
]
181+
}
182+
],
183+
"source": [
184+
"es = EntitySet(\"es\", sql_connector.dataframes, sql_connector.relationships) \n",
185+
"print(es) "
186+
]
187+
},
188+
{
189+
"cell_type": "code",
190+
"execution_count": null,
191+
"metadata": {},
192+
"outputs": [],
193+
"source": []
194+
}
195+
],
196+
"metadata": {
197+
"kernelspec": {
198+
"display_name": "Python 3.8.12 64-bit ('venv_x86')",
199+
"language": "python",
200+
"name": "python3"
201+
},
202+
"language_info": {
203+
"codemirror_mode": {
204+
"name": "ipython",
205+
"version": 3
206+
},
207+
"file_extension": ".py",
208+
"mimetype": "text/x-python",
209+
"name": "python",
210+
"nbconvert_exporter": "python",
211+
"pygments_lexer": "ipython3",
212+
"version": "3.8.12"
213+
},
214+
"orig_nbformat": 4,
215+
"vscode": {
216+
"interpreter": {
217+
"hash": "3f6b062a214ec48d1657976024d6bc68979519d14a33afb6ad033fc2e4189514"
218+
}
219+
}
220+
},
221+
"nbformat": 4,
222+
"nbformat_minor": 2
223+
}

0 commit comments

Comments
 (0)