Skip to content

Commit 65e9d43

Browse files
committed
Fix CSV type inference in sgr csv import
When using `sgr csv import`, we load data using `COPY FROM STDIN`, which doesn't let us treat empty strings as NULLs, whereas with the CSV FDW we can do that. To get around this, we take empty strings into account during type inference in the former case (so that integer columns containing empty strings end up as VARCHARs rather than integers).
1 parent dfdb892 commit 65e9d43

2 files changed

Lines changed: 9 additions & 4 deletions

File tree

splitgraph/commandline/ingestion.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,9 @@ def csv_import(
142142
sample = [[str(i) for i in range(len(sample))]] + sample
143143

144144
type_overrides = dict(override_type or [])
145-
sg_schema = infer_sg_schema(sample, override_types=type_overrides, primary_keys=primary_key)
145+
sg_schema = infer_sg_schema(
146+
sample, override_types=type_overrides, primary_keys=primary_key, ignore_empty_strings=False
147+
)
146148
logging.debug("Using Splitgraph schema: %r", sg_schema)
147149

148150
# Reset the stream and pass it to COPY FROM STDIN

splitgraph/ingestion/inference.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,12 +48,12 @@ def parse_json(json_s: str):
4848
]
4949

5050

51-
def _infer_column_schema(column_sample: Sequence[str]) -> str:
51+
def _infer_column_schema(column_sample: Sequence[str], ignore_empty_strings=True) -> str:
5252
for candidate, converter in _CONVERTERS:
5353
try:
5454
seen_value = False
5555
for c in column_sample:
56-
if c == "" or c is None:
56+
if (c == "" and ignore_empty_strings) or c is None:
5757
continue
5858

5959
seen_value = True
@@ -73,6 +73,7 @@ def infer_sg_schema(
7373
sample: Sequence[List[str]],
7474
override_types: Optional[Dict[str, str]] = None,
7575
primary_keys: Optional[List[str]] = None,
76+
ignore_empty_strings: bool = True,
7677
):
7778
override_types = override_types or {}
7879
primary_keys = primary_keys or []
@@ -92,7 +93,9 @@ def infer_sg_schema(
9293
)
9394

9495
for i, (c_name, c_sample) in enumerate(zip(header, columns)):
95-
pg_type = override_types.get(c_name, _infer_column_schema(c_sample))
96+
pg_type = override_types.get(
97+
c_name, _infer_column_schema(c_sample, ignore_empty_strings=ignore_empty_strings)
98+
)
9699

97100
result.append(
98101
TableColumn(

0 commit comments

Comments
 (0)