Skip to content

Commit 94dec8d

Browse files
authored
Merge pull request hed-standard#1292 from VisLab/fix_extras
Updated to support Pandas 3 - addresses issue hed-standard#1291
2 parents 0929da8 + f4086c4 commit 94dec8d

15 files changed

Lines changed: 700 additions & 17 deletions

CHANGELOG.md

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,43 @@
1+
# Unreleased
2+
3+
## New features
4+
5+
### Pandas 3.0 compatibility
6+
7+
All pandas 3.0 breaking changes have been addressed, and the pandas version constraint in `pyproject.toml` has been updated from `<3.0.0` to `<4.0.0`:
8+
9+
- **Copy-on-Write (CoW)**: Chained `df[col][mask] = ...` assignments in `df_util.py` replaced with `df.loc[mask, col] = ...` to prevent silent no-ops and the new `ChainedAssignmentError`.
10+
- **`drop()` API**: Removed redundant `axis=1` argument when `columns=` is already specified in `data_util.py` (the two arguments conflict in pandas 3.0).
11+
- **NaN handling in schema loading**: `df2schema.py`, `df_util.py`, and `hed_id_util.py` now check `isinstance(value, str)` before calling string methods such as `.strip()` and `.startswith()`, preventing `AttributeError` when empty cells are `float` NaN rather than `""`.
12+
- **StringDtype in `_merge_dataframes`**: Fillna logic updated in `schema_io/df_util.py` to use `pd.api.types.is_numeric_dtype()` instead of `dtype == "object"`, correctly handling pandas 3.0 `StringDtype` columns.
13+
- **Float64 column FutureWarning**: `assign_hed_ids_section` in `hed_id_util.py` now casts all-NaN hedId columns from `float64` to `object` before assigning string values, eliminating a pandas deprecation warning.
14+
- Added `tests/test_pandas3_compat.py` with 27 targeted tests covering all of the above fixes.
15+
16+
### Filename filter for `extract bids-sidecar`
17+
18+
`hedpy extract bids-sidecar` and the underlying `hed_extract_bids_sidecar` script now accept a `--filter` / `-fl` option. Only files whose name contains the filter string are included in the sidecar extraction. Example:
19+
20+
```bash
21+
hedpy extract bids-sidecar /path/to/dataset --filter sub-01
22+
```
23+
24+
### `BidsFileGroup.get_task_names()`
25+
26+
`BidsFileGroup` now exposes a `get_task_names()` method that returns a sorted list of unique task names (the `xxxx` portion of `task-xxxx` BIDS entities) found across all sidecar and data files in the group.
27+
28+
### `TabularSummary` deduplicates `skip_cols`
29+
30+
`TabularSummary.__init__` now deduplicates the `skip_cols` list using `dict.fromkeys`, preserving order. Passing the same column name more than once no longer produces duplicate entries in `skip_cols` or in the `"Skip columns"` field of the summary metadata output. Functional behavior (which columns are skipped) is unchanged.
31+
32+
## Documentation
33+
34+
- Removed `{index}` placeholder annotations from `README.md` and `examples/README.md`.
35+
36+
## CI/CD
37+
38+
- Bumped `actions/configure-pages` from 5 to 6.
39+
- Updated `spec_tests/hed-tests` submodule.
40+
141
# Release 1.0.0 March 27, 2026
242

343
This is a major release with breaking changes. It removes several subsystems that are no longer part of the core `hedtools` package, completes the schema library-extras support across all schema formats, and cleans up the public API.

hed/cli/cli.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -849,6 +849,9 @@ def extract():
849849
# Exclude specific columns from the template
850850
hedpy extract bids-sidecar /path/to/dataset -sc onset -sc duration -sc response_time
851851
852+
# Filter to only files containing 'sub-01' in their name
853+
hedpy extract bids-sidecar /path/to/dataset --filter sub-01
854+
852855
# Save logs to file and suppress console output
853856
hedpy extract bids-sidecar /path/to/dataset --log-file extraction.log --log-quiet
854857
""",
@@ -873,6 +876,14 @@ def extract():
873876
metavar=METAVAR_NAME,
874877
help="Directory names (relative to root) to exclude (e.g., -x sourcedata -x derivatives)",
875878
)
879+
@optgroup.option(
880+
"-fl",
881+
"--filter",
882+
"filename_filter",
883+
default=None,
884+
metavar=METAVAR_STRING,
885+
help="Filter string for filenames; only files containing this string in their name will be processed",
886+
)
876887
# Column processing options
877888
@optgroup.group("Column processing options")
878889
@optgroup.option(
@@ -933,6 +944,7 @@ def extract_bids_sidecar_cmd(
933944
ctx,
934945
data_path,
935946
suffix,
947+
filename_filter,
936948
value_columns,
937949
skip_columns,
938950
log_level,
@@ -949,6 +961,8 @@ def extract_bids_sidecar_cmd(
949961
from hed.scripts.hed_extract_bids_sidecar import main as extract_main
950962

951963
args = [data_path, "-s", suffix]
964+
if filename_filter:
965+
args.extend(["-fl", filename_filter])
952966
if value_columns:
953967
args.append("-vc")
954968
args.extend(value_columns)

hed/models/df_util.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def shrink_defs(df, hed_schema, columns=None):
4848

4949
for column in columns:
5050
mask = df[column].str.contains("Def-expand/", case=False)
51-
df[column][mask] = df[column][mask].apply(partial(_shrink_defs, hed_schema=hed_schema))
51+
df.loc[mask, column] = df[column][mask].apply(partial(_shrink_defs, hed_schema=hed_schema))
5252

5353

5454
def expand_defs(df, hed_schema, def_dict, columns=None):
@@ -307,11 +307,11 @@ def _filter_by_index_list(original_data, indexed_dict):
307307
else:
308308
raise TypeError("Input must be a pandas Series or DataFrame")
309309

310-
new_series = pd.Series([""] * len(data_series), dtype=str)
310+
new_series = pd.Series([""] * len(data_series), dtype=data_series.dtype)
311311
for _onset, indices in indexed_dict.items():
312312
if indices:
313313
first_index = indices[0]
314-
new_series[first_index] = ",".join([str(data_series[i]) for i in indices])
314+
new_series.iloc[first_index] = ",".join([str(data_series.iloc[i]) for i in indices])
315315

316316
if isinstance(original_data, pd.Series):
317317
return new_series

hed/schema/schema_io/df2schema.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -99,10 +99,10 @@ def _get_prologue_epilogue(self, file_data):
9999
for _row_number, row in file_data[constants.STRUCT_KEY].iterrows():
100100
cls = row[constants.subclass_of]
101101
description = row[constants.dcdescription]
102-
if cls == "HedPrologue" and description:
102+
if cls == "HedPrologue" and isinstance(description, str) and description:
103103
prologue = description.replace("\\n", "\n")
104104
continue
105-
elif cls == "HedEpilogue" and description:
105+
elif cls == "HedEpilogue" and isinstance(description, str) and description:
106106
epilogue = description.replace("\\n", "\n")
107107

108108
return prologue, epilogue
@@ -237,13 +237,13 @@ def _create_entry(self, row_number, row, key_class, full_tag_name=None):
237237
node_attributes = self._get_tag_attributes(row_number, row)
238238

239239
hed_id = row[constants.hed_id]
240-
if hed_id:
240+
if isinstance(hed_id, str) and hed_id:
241241
node_attributes[HedKey.HedID] = hed_id
242242

243243
description = row[constants.dcdescription]
244244
tag_entry = self._schema._create_tag_entry(element_name, key_class)
245245

246-
if description:
246+
if isinstance(description, str) and description:
247247
tag_entry.description = description.strip()
248248

249249
for attribute_name, attribute_value in node_attributes.items():

hed/schema/schema_io/df_util.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -123,12 +123,12 @@ def _merge_dataframes(df1, df2, key_column):
123123
if col not in df1.columns and col != key_column:
124124
df1 = df1.merge(df2[[key_column, col]], on=key_column, how="left")
125125

126-
# Fill missing values with '' for object columns, 0 for numeric columns
126+
# Fill missing values with '' for non-numeric columns, 0 for numeric columns
127127
for col in df1.columns:
128-
if df1[col].dtype == "object":
129-
df1[col] = df1[col].fillna("")
130-
else:
128+
if pd.api.types.is_numeric_dtype(df1[col]):
131129
df1[col] = df1[col].fillna(0)
130+
else:
131+
df1[col] = df1[col].fillna("")
132132

133133
return df1
134134

@@ -311,6 +311,9 @@ def get_attributes_from_row(row):
311311
else:
312312
attr_string = ""
313313

314+
if not isinstance(attr_string, str):
315+
attr_string = ""
316+
314317
if constants.subclass_of in row.index and row[constants.subclass_of] == "HedHeader":
315318
header_attributes, _ = _parse_header_attributes_line(attr_string)
316319
return header_attributes

hed/schema/schema_io/hed_id_util.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,8 @@ def _verify_hedid_matches(section, df, unused_tag_ids):
155155
if label.endswith("-#"):
156156
label = label.replace("-#", "/#")
157157
df_id = row[constants.hed_id]
158+
if not isinstance(df_id, str):
159+
df_id = ""
158160
entry = section.get(label)
159161
if not entry:
160162
# Neither side has a hedID, so nothing to do.
@@ -209,10 +211,14 @@ def assign_hed_ids_section(df, unused_tag_ids):
209211
unused_tag_ids -= get_all_ids(df)
210212
sorted_unused_ids = sorted(unused_tag_ids, reverse=True)
211213

214+
# If the hedId column is float (all-NaN case), cast to object so strings can be assigned
215+
if pd.api.types.is_float_dtype(df[constants.hed_id]):
216+
df[constants.hed_id] = df[constants.hed_id].astype(object)
217+
212218
for _row_number, row in df.iterrows():
213219
hed_id = row[constants.hed_id]
214220
# we already verified existing ones
215-
if hed_id:
221+
if isinstance(hed_id, str) and hed_id:
216222
continue
217223
df.at[_row_number, constants.hed_id] = f"HED_{sorted_unused_ids.pop():07d}"
218224

hed/scripts/hed_extract_bids_sidecar.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@
2525
# Exclude specific columns from the template
2626
hed_extract_bids_sidecar /path/to/dataset --skip-columns onset duration response_time
2727
28+
# Filter to only files containing 'sub-01' in their name
29+
hed_extract_bids_sidecar /path/to/dataset --filter sub-01
30+
2831
# Save logs to file and suppress console output
2932
hed_extract_bids_sidecar /path/to/dataset --log-file extraction.log --log-quiet
3033
"""
@@ -33,8 +36,10 @@
3336
import json
3437
import logging
3538
import sys
39+
from pathlib import Path
3640
from hed import __version__
3741
from hed.tools import BidsDataset
42+
from hed.tools.analysis.tabular_summary import TabularSummary
3843
from hed.scripts.script_utils import setup_logging
3944

4045

@@ -75,6 +80,13 @@ def get_parser():
7580
dest="exclude_dirs",
7681
help="Directory names (relative to data_path) to exclude in search for files to process (default: sourcedata derivatives code stimuli)",
7782
)
83+
file_group.add_argument(
84+
"-fl",
85+
"--filter",
86+
dest="filename_filter",
87+
default=None,
88+
help="Optional string to filter filenames; only files containing this string in their name will be processed",
89+
)
7890

7991
# Column processing options
8092
column_group = parser.add_argument_group("Column processing options")
@@ -151,6 +163,7 @@ def extract_template(args):
151163
logger.info(f"HED tools version: {__version__}")
152164
logger.debug(f"Exclude directories: {args.exclude_dirs}")
153165
logger.debug(f"File suffix: {args.suffix}")
166+
logger.debug(f"Filename filter: {args.filename_filter}")
154167
logger.debug(f"Value columns: {args.value_columns}")
155168
logger.debug(f"Skip columns: {args.skip_columns}")
156169

@@ -177,9 +190,19 @@ def extract_template(args):
177190

178191
logger.debug(f"Skip columns: {skip_cols}")
179192

180-
# Create TabularSummary using the summarize method of BidsFileGroup
193+
# Build the file list, applying filename filter if specified
194+
file_list = list(file_group.datafile_dict.keys())
195+
if args.filename_filter:
196+
original_count = len(file_list)
197+
file_list = [f for f in file_list if args.filename_filter in Path(f).name]
198+
logger.info(
199+
f"Filename filter '{args.filename_filter}' reduced files from {original_count} to {len(file_list)}"
200+
)
201+
202+
# Create TabularSummary from the (possibly filtered) file list
181203
logger.info("Creating tabular summary...")
182-
summary = file_group.summarize(value_cols=args.value_columns, skip_cols=skip_cols)
204+
summary = TabularSummary(value_cols=args.value_columns, skip_cols=skip_cols)
205+
summary.update(file_list)
183206

184207
logger.info(f"Processed {summary.total_files} files")
185208
logger.info(f"Total events: {summary.total_events}")

hed/tools/analysis/tabular_summary.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def __init__(self, value_cols=None, skip_cols=None, name="", categorical_limit=N
3737
for value in value_cols:
3838
self.value_info[value] = [0, 0]
3939
if skip_cols:
40-
self.skip_cols = skip_cols.copy()
40+
self.skip_cols = list(dict.fromkeys(skip_cols))
4141
else:
4242
self.skip_cols = []
4343
self.total_files = 0

hed/tools/bids/bids_file_group.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""A group of BIDS files with specified suffix name."""
22

33
import os
4+
import re
45
import logging
56
import pandas as pd
67

@@ -80,6 +81,26 @@ def summarize(self, value_cols=None, skip_cols=None):
8081
info.update(list(self.datafile_dict.keys()))
8182
return info
8283

84+
def get_task_names(self):
    """Return a sorted list of the unique task names in this group's sidecar and data file names.

    Returns:
        list: Sorted unique task name strings (the ``xxxx`` portion of ``task-xxxx`` BIDS entities).

    Notes:
        - Both ``sidecar_dict`` and ``datafile_dict`` file paths are examined.
        - The BIDS ``task-`` entity is matched case-insensitively.

    """
    # A task entity starts either at the beginning of the basename or after an
    # underscore separator; the captured name ends at the next '_', '.', or '-'.
    entity_regex = re.compile(r"(?:^|_)task-([^_.-]+)", re.IGNORECASE)
    all_paths = (*self.sidecar_dict, *self.datafile_dict)
    hits = (entity_regex.search(os.path.basename(path)) for path in all_paths)
    return sorted({hit.group(1) for hit in hits if hit is not None})
103+
83104
def validate(self, hed_schema, extra_def_dicts=None, check_for_warnings=False):
84105
"""Validate the sidecars and datafiles and return a list of issues.
85106

hed/tools/util/data_util.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ def delete_columns(df, column_list):
6262
"""
6363

6464
delete_cols = list(set(column_list).intersection(set(df)))
65-
df.drop(columns=delete_cols, axis=1, inplace=True)
65+
df.drop(columns=delete_cols, inplace=True)
6666

6767

6868
def delete_rows_by_column(df, value, column_list=None):

0 commit comments

Comments
 (0)