Fix CSV type inference in sgr csv import

mildbyte · mildbyte · commit 65e9d4364fdc · 2021-12-30T11:16:55.000Z
When using `sgr csv import`, we load data using `COPY FROM STDIN`, which doesn't
let us treat empty strings as NULLs, whereas with the CSV FDW we can do that.
To get around this, we no longer skip empty strings during type inference in the
former case (so that integer columns with empty strings end up as VARCHARs
instead of integers).
diff --git a/splitgraph/commandline/ingestion.py b/splitgraph/commandline/ingestion.py
@@ -142,7 +142,9 @@ def csv_import(
         sample = [[str(i) for i in range(len(sample))]] + sample
 
     type_overrides = dict(override_type or [])
-    sg_schema = infer_sg_schema(sample, override_types=type_overrides, primary_keys=primary_key)
+    sg_schema = infer_sg_schema(
+        sample, override_types=type_overrides, primary_keys=primary_key, ignore_empty_strings=False
+    )
     logging.debug("Using Splitgraph schema: %r", sg_schema)
 
     # Reset the stream and pass it to COPY FROM STDIN
diff --git a/splitgraph/ingestion/inference.py b/splitgraph/ingestion/inference.py
@@ -48,12 +48,12 @@ def parse_json(json_s: str):
 ]
 
 
-def _infer_column_schema(column_sample: Sequence[str]) -> str:
+def _infer_column_schema(column_sample: Sequence[str], ignore_empty_strings=True) -> str:
     for candidate, converter in _CONVERTERS:
         try:
             seen_value = False
             for c in column_sample:
-                if c == "" or c is None:
+                if (c == "" and ignore_empty_strings) or c is None:
                     continue
 
                 seen_value = True
@@ -73,6 +73,7 @@ def infer_sg_schema(
     sample: Sequence[List[str]],
     override_types: Optional[Dict[str, str]] = None,
     primary_keys: Optional[List[str]] = None,
+    ignore_empty_strings: bool = True,
 ):
     override_types = override_types or {}
     primary_keys = primary_keys or []
@@ -92,7 +93,9 @@ def infer_sg_schema(
         )
 
     for i, (c_name, c_sample) in enumerate(zip(header, columns)):
-        pg_type = override_types.get(c_name, _infer_column_schema(c_sample))
+        pg_type = override_types.get(
+            c_name, _infer_column_schema(c_sample, ignore_empty_strings=ignore_empty_strings)
+        )
 
         result.append(
             TableColumn(