Skip to content

Tabsyn Sampling Fails When Dataset Contains Only Categorical Features #29

@kirilzilla

Description

@kirilzilla

Description:

I encountered an error during the sampling phase using the tabsyn method. The issue seems to be related to missing numerical features during preprocessing.

Steps to Reproduce:

  1. Use the following insurance.json configuration:
    It has 25 categorical columns and no numerical columns; the first column is the binary target column (0 or 1).

    {
        "name": "insurance",
        "task_type": "binclass",
        "header": "infer",
        "column_names": [
            "GoodStudent", "Age", "SocioEcon", "RiskAversion", "VehicleYear", 
            "RuggedAuto", "MakeModel", "DrivQuality", "Mileage", "Antilock", 
            "DrivingSkill", "SeniorTrain", "ThisCarCost", "Theft", "CarValue", 
            "HomeBase", "AntiTheft", "PropCost", "OtherCarCost", "OtherCar", 
            "MedCost", "Cushioning", "Airbag", "ILiCost", "DrivHist"
        ],
        "num_col_idx": [],
        "cat_col_idx": [
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 
            11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 
            21, 22, 23, 24
        ],
        "target_col_idx": [0],
        "file_type": "csv",
        "data_path": "data/insurance/insurance.csv",
        "test_path": null
    }
    
  2. Preprocess and train the model successfully using VAE and tabsyn.

The preprocessing output reports a numerical array even though there are no numerical features:
Numerical (18000, 0)
Categorical (18000, 24)

  • I think this might be a mistake. It should be no numerical columns and 25 categorical columns instead of 24.
  3. Run the sampling command:
    python main.py --dataname insurance --method tabsyn --mode sample

During the sampling step, the following error is thrown:

ValueError: Found array with 0 feature(s) (shape=(18000, 0)) while a minimum of 1 is required by QuantileTransformer.

This is the processed JSON in tabsyn/data/insurance/info.json

{
 "name": "insurance",
 "task_type": "binclass",
 "header": "infer",
 "column_names": [
     "GoodStudent",
     "Age",
     "SocioEcon",
     "RiskAversion",
     "VehicleYear",
     "RuggedAuto",
     "MakeModel",
     "DrivQuality",
     "Mileage",
     "Antilock",
     "DrivingSkill",
     "SeniorTrain",
     "ThisCarCost",
     "Theft",
     "CarValue",
     "HomeBase",
     "AntiTheft",
     "PropCost",
     "OtherCarCost",
     "OtherCar",
     "MedCost",
     "Cushioning",
     "Airbag",
     "ILiCost",
     "DrivHist"
 ],
 "num_col_idx": [],
 "cat_col_idx": [
     1,
     2,
     3,
     4,
     5,
     6,
     7,
     8,
     9,
     10,
     11,
     12,
     13,
     14,
     15,
     16,
     17,
     18,
     19,
     20,
     21,
     22,
     23,
     24
 ],
 "target_col_idx": [
     0
 ],
 "file_type": "csv",
 "data_path": "data/insurance/insurance.csv",
 "test_path": null,
 "column_info": {
     "1": {},
     "type": "categorical",
     "categorizes": [
         0,
         1
     ],
     "2": {},
     "3": {},
     "4": {},
     "5": {},
     "6": {},
     "7": {},
     "8": {},
     "9": {},
     "10": {},
     "11": {},
     "12": {},
     "13": {},
     "14": {},
     "15": {},
     "16": {},
     "17": {},
     "18": {},
     "19": {},
     "20": {},
     "21": {},
     "22": {},
     "23": {},
     "24": {},
     "0": {}
 },
 "train_num": 18000,
 "test_num": 2000,
 "idx_mapping": {
     "0": 24,
     "1": 0,
     "2": 1,
     "3": 2,
     "4": 3,
     "5": 4,
     "6": 5,
     "7": 6,
     "8": 7,
     "9": 8,
     "10": 9,
     "11": 10,
     "12": 11,
     "13": 12,
     "14": 13,
     "15": 14,
     "16": 15,
     "17": 16,
     "18": 17,
     "19": 18,
     "20": 19,
     "21": 20,
     "22": 21,
     "23": 22,
     "24": 23
 },
 "inverse_idx_mapping": {
     "24": 0,
     "0": 1,
     "1": 2,
     "2": 3,
     "3": 4,
     "4": 5,
     "5": 6,
     "6": 7,
     "7": 8,
     "8": 9,
     "9": 10,
     "10": 11,
     "11": 12,
     "12": 13,
     "13": 14,
     "14": 15,
     "15": 16,
     "16": 17,
     "17": 18,
     "18": 19,
     "19": 20,
     "20": 21,
     "21": 22,
     "22": 23,
     "23": 24
 },
 "idx_name_mapping": {
     "0": "GoodStudent",
     "1": "Age",
     "2": "SocioEcon",
     "3": "RiskAversion",
     "4": "VehicleYear",
     "5": "RuggedAuto",
     "6": "MakeModel",
     "7": "DrivQuality",
     "8": "Mileage",
     "9": "Antilock",
     "10": "DrivingSkill",
     "11": "SeniorTrain",
     "12": "ThisCarCost",
     "13": "Theft",
     "14": "CarValue",
     "15": "HomeBase",
     "16": "AntiTheft",
     "17": "PropCost",
     "18": "OtherCarCost",
     "19": "OtherCar",
     "20": "MedCost",
     "21": "Cushioning",
     "22": "Airbag",
     "23": "ILiCost",
     "24": "DrivHist"
 },
 "metadata": {
     "columns": {
         "1": {
             "sdtype": "categorical"
         },
         "2": {
             "sdtype": "categorical"
         },
         "3": {
             "sdtype": "categorical"
         },
         "4": {
             "sdtype": "categorical"
         },
         "5": {
             "sdtype": "categorical"
         },
         "6": {
             "sdtype": "categorical"
         },
         "7": {
             "sdtype": "categorical"
         },
         "8": {
             "sdtype": "categorical"
         },
         "9": {
             "sdtype": "categorical"
         },
         "10": {
             "sdtype": "categorical"
         },
         "11": {
             "sdtype": "categorical"
         },
         "12": {
             "sdtype": "categorical"
         },
         "13": {
             "sdtype": "categorical"
         },
         "14": {
             "sdtype": "categorical"
         },
         "15": {
             "sdtype": "categorical"
         },
         "16": {
             "sdtype": "categorical"
         },
         "17": {
             "sdtype": "categorical"
         },
         "18": {
             "sdtype": "categorical"
         },
         "19": {
             "sdtype": "categorical"
         },
         "20": {
             "sdtype": "categorical"
         },
         "21": {
             "sdtype": "categorical"
         },
         "22": {
             "sdtype": "categorical"
         },
         "23": {
             "sdtype": "categorical"
         },
         "24": {
             "sdtype": "categorical"
         },
         "0": {
             "sdtype": "categorical"
         }
     }
 }
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions