Skip to content

Commit 94dec8d

Browse files
authored
Merge pull request hed-standard#1292 from VisLab/fix_extras
Updated to support Pandas 3 - addresses issue hed-standard#1291
2 parents 0929da8 + f4086c4 commit 94dec8d

15 files changed

Lines changed: 700 additions & 17 deletions

CHANGELOG.md

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,43 @@
1+
# Unreleased
2+
3+
## New features
4+
5+
### Pandas 3.0 compatibility
6+
7+
All pandas 3.0 breaking changes have been addressed, and the pandas version constraint in `pyproject.toml` has been updated from `<3.0.0` to `<4.0.0`:
8+
9+
- **Copy-on-Write (CoW)**: Chained `df[col][mask] = ...` assignments in `df_util.py` replaced with `df.loc[mask, col] = ...` to prevent silent no-ops and the new `ChainedAssignmentError`.
10+
- **`drop()` API**: Removed redundant `axis=1` argument when `columns=` is already specified in `data_util.py` (the two arguments conflict in pandas 3.0).
11+
- **NaN handling in schema loading**: `df2schema.py`, `df_util.py`, and `hed_id_util.py` now check `isinstance(value, str)` before calling string methods such as `.strip()` and `.startswith()`, preventing `AttributeError` when empty cells are `float` NaN rather than `""`.
12+
- **StringDtype in `_merge_dataframes`**: Fillna logic updated in `schema_io/df_util.py` to use `pd.api.types.is_numeric_dtype()` instead of `dtype == "object"`, correctly handling pandas 3.0 `StringDtype` columns.
13+
- **Float64 column FutureWarning**: `assign_hed_ids_section` in `hed_id_util.py` now casts all-NaN hedId columns from `float64` to `object` before assigning string values, eliminating a pandas deprecation warning.
14+
- Added `tests/test_pandas3_compat.py` with 27 targeted tests covering all of the above fixes.
15+
16+
### Filename filter for `extract bids-sidecar`
17+
18+
`hedpy extract bids-sidecar` and the underlying `hed_extract_bids_sidecar` script now accept a `--filter` / `-fl` option. Only files whose name contains the filter string are included in the sidecar extraction. Example:
19+
20+
```bash
21+
hedpy extract bids-sidecar /path/to/dataset --filter sub-01
22+
```
23+
24+
### `BidsFileGroup.get_task_names()`
25+
26+
`BidsFileGroup` now exposes a `get_task_names()` method that returns a sorted list of unique task names (the `xxxx` portion of `task-xxxx` BIDS entities) found across all sidecar and data files in the group.
27+
28+
### `TabularSummary` deduplicates `skip_cols`
29+
30+
`TabularSummary.__init__` now deduplicates the `skip_cols` list using `dict.fromkeys`, preserving order. Passing the same column name more than once no longer produces duplicate entries in `skip_cols` or in the `"Skip columns"` field of the summary metadata output. Functional behavior (which columns are skipped) is unchanged.
31+
32+
## Documentation
33+
34+
- Removed `{index}` placeholder annotations from `README.md` and `examples/README.md`.
35+
36+
## CI/CD
37+
38+
- Bumped `actions/configure-pages` from 5 to 6.
39+
- Updated `spec_tests/hed-tests` submodule.
40+
141
# Release 1.0.0 March 27, 2026
242

343
This is a major release with breaking changes. It removes several subsystems that are no longer part of the core `hedtools` package, completes the schema library-extras support across all schema formats, and cleans up the public API.

hed/cli/cli.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -849,6 +849,9 @@ def extract():
849849
# Exclude specific columns from the template
850850
hedpy extract bids-sidecar /path/to/dataset -sc onset -sc duration -sc response_time
851851
852+
# Filter to only files containing 'sub-01' in their name
853+
hedpy extract bids-sidecar /path/to/dataset --filter sub-01
854+
852855
# Save logs to file and suppress console output
853856
hedpy extract bids-sidecar /path/to/dataset --log-file extraction.log --log-quiet
854857
""",
@@ -873,6 +876,14 @@ def extract():
873876
metavar=METAVAR_NAME,
874877
help="Directory names (relative to root) to exclude (e.g., -x sourcedata -x derivatives)",
875878
)
879+
@optgroup.option(
880+
"-fl",
881+
"--filter",
882+
"filename_filter",
883+
default=None,
884+
metavar=METAVAR_STRING,
885+
help="Filter string for filenames; only files containing this string in their name will be processed",
886+
)
876887
# Column processing options
877888
@optgroup.group("Column processing options")
878889
@optgroup.option(
@@ -933,6 +944,7 @@ def extract_bids_sidecar_cmd(
933944
ctx,
934945
data_path,
935946
suffix,
947+
filename_filter,
936948
value_columns,
937949
skip_columns,
938950
log_level,
@@ -949,6 +961,8 @@ def extract_bids_sidecar_cmd(
949961
from hed.scripts.hed_extract_bids_sidecar import main as extract_main
950962

951963
args = [data_path, "-s", suffix]
964+
if filename_filter:
965+
args.extend(["-fl", filename_filter])
952966
if value_columns:
953967
args.append("-vc")
954968
args.extend(value_columns)

hed/models/df_util.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def shrink_defs(df, hed_schema, columns=None):
4848

4949
for column in columns:
5050
mask = df[column].str.contains("Def-expand/", case=False)
51-
df[column][mask] = df[column][mask].apply(partial(_shrink_defs, hed_schema=hed_schema))
51+
df.loc[mask, column] = df[column][mask].apply(partial(_shrink_defs, hed_schema=hed_schema))
5252

5353

5454
def expand_defs(df, hed_schema, def_dict, columns=None):
@@ -307,11 +307,11 @@ def _filter_by_index_list(original_data, indexed_dict):
307307
else:
308308
raise TypeError("Input must be a pandas Series or DataFrame")
309309

310-
new_series = pd.Series([""] * len(data_series), dtype=str)
310+
new_series = pd.Series([""] * len(data_series), dtype=data_series.dtype)
311311
for _onset, indices in indexed_dict.items():
312312
if indices:
313313
first_index = indices[0]
314-
new_series[first_index] = ",".join([str(data_series[i]) for i in indices])
314+
new_series.iloc[first_index] = ",".join([str(data_series.iloc[i]) for i in indices])
315315

316316
if isinstance(original_data, pd.Series):
317317
return new_series

hed/schema/schema_io/df2schema.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -99,10 +99,10 @@ def _get_prologue_epilogue(self, file_data):
9999
for _row_number, row in file_data[constants.STRUCT_KEY].iterrows():
100100
cls = row[constants.subclass_of]
101101
description = row[constants.dcdescription]
102-
if cls == "HedPrologue" and description:
102+
if cls == "HedPrologue" and isinstance(description, str) and description:
103103
prologue = description.replace("\\n", "\n")
104104
continue
105-
elif cls == "HedEpilogue" and description:
105+
elif cls == "HedEpilogue" and isinstance(description, str) and description:
106106
epilogue = description.replace("\\n", "\n")
107107

108108
return prologue, epilogue
@@ -237,13 +237,13 @@ def _create_entry(self, row_number, row, key_class, full_tag_name=None):
237237
node_attributes = self._get_tag_attributes(row_number, row)
238238

239239
hed_id = row[constants.hed_id]
240-
if hed_id:
240+
if isinstance(hed_id, str) and hed_id:
241241
node_attributes[HedKey.HedID] = hed_id
242242

243243
description = row[constants.dcdescription]
244244
tag_entry = self._schema._create_tag_entry(element_name, key_class)
245245

246-
if description:
246+
if isinstance(description, str) and description:
247247
tag_entry.description = description.strip()
248248

249249
for attribute_name, attribute_value in node_attributes.items():

hed/schema/schema_io/df_util.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -123,12 +123,12 @@ def _merge_dataframes(df1, df2, key_column):
123123
if col not in df1.columns and col != key_column:
124124
df1 = df1.merge(df2[[key_column, col]], on=key_column, how="left")
125125

126-
# Fill missing values with '' for object columns, 0 for numeric columns
126+
# Fill missing values with '' for non-numeric columns, 0 for numeric columns
127127
for col in df1.columns:
128-
if df1[col].dtype == "object":
129-
df1[col] = df1[col].fillna("")
130-
else:
128+
if pd.api.types.is_numeric_dtype(df1[col]):
131129
df1[col] = df1[col].fillna(0)
130+
else:
131+
df1[col] = df1[col].fillna("")
132132

133133
return df1
134134

@@ -311,6 +311,9 @@ def get_attributes_from_row(row):
311311
else:
312312
attr_string = ""
313313

314+
if not isinstance(attr_string, str):
315+
attr_string = ""
316+
314317
if constants.subclass_of in row.index and row[constants.subclass_of] == "HedHeader":
315318
header_attributes, _ = _parse_header_attributes_line(attr_string)
316319
return header_attributes

hed/schema/schema_io/hed_id_util.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,8 @@ def _verify_hedid_matches(section, df, unused_tag_ids):
155155
if label.endswith("-#"):
156156
label = label.replace("-#", "/#")
157157
df_id = row[constants.hed_id]
158+
if not isinstance(df_id, str):
159+
df_id = ""
158160
entry = section.get(label)
159161
if not entry:
160162
# Neither side has a hedID, so nothing to do.
@@ -209,10 +211,14 @@ def assign_hed_ids_section(df, unused_tag_ids):
209211
unused_tag_ids -= get_all_ids(df)
210212
sorted_unused_ids = sorted(unused_tag_ids, reverse=True)
211213

214+
# If the hedId column is float (all-NaN case), cast to object so strings can be assigned
215+
if pd.api.types.is_float_dtype(df[constants.hed_id]):
216+
df[constants.hed_id] = df[constants.hed_id].astype(object)
217+
212218
for _row_number, row in df.iterrows():
213219
hed_id = row[constants.hed_id]
214220
# we already verified existing ones
215-
if hed_id:
221+
if isinstance(hed_id, str) and hed_id:
216222
continue
217223
df.at[_row_number, constants.hed_id] = f"HED_{sorted_unused_ids.pop():07d}"
218224

hed/scripts/hed_extract_bids_sidecar.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@
2525
# Exclude specific columns from the template
2626
hed_extract_bids_sidecar /path/to/dataset --skip-columns onset duration response_time
2727
28+
# Filter to only files containing 'sub-01' in their name
29+
hed_extract_bids_sidecar /path/to/dataset --filter sub-01
30+
2831
# Save logs to file and suppress console output
2932
hed_extract_bids_sidecar /path/to/dataset --log-file extraction.log --log-quiet
3033
"""
@@ -33,8 +36,10 @@
3336
import json
3437
import logging
3538
import sys
39+
from pathlib import Path
3640
from hed import __version__
3741
from hed.tools import BidsDataset
42+
from hed.tools.analysis.tabular_summary import TabularSummary
3843
from hed.scripts.script_utils import setup_logging
3944

4045

@@ -75,6 +80,13 @@ def get_parser():
7580
dest="exclude_dirs",
7681
help="Directory names (relative to data_path) to exclude in search for files to process (default: sourcedata derivatives code stimuli)",
7782
)
83+
file_group.add_argument(
84+
"-fl",
85+
"--filter",
86+
dest="filename_filter",
87+
default=None,
88+
help="Optional string to filter filenames; only files containing this string in their name will be processed",
89+
)
7890

7991
# Column processing options
8092
column_group = parser.add_argument_group("Column processing options")
@@ -151,6 +163,7 @@ def extract_template(args):
151163
logger.info(f"HED tools version: {__version__}")
152164
logger.debug(f"Exclude directories: {args.exclude_dirs}")
153165
logger.debug(f"File suffix: {args.suffix}")
166+
logger.debug(f"Filename filter: {args.filename_filter}")
154167
logger.debug(f"Value columns: {args.value_columns}")
155168
logger.debug(f"Skip columns: {args.skip_columns}")
156169

@@ -177,9 +190,19 @@ def extract_template(args):
177190

178191
logger.debug(f"Skip columns: {skip_cols}")
179192

180-
# Create TabularSummary using the summarize method of BidsFileGroup
193+
# Build the file list, applying filename filter if specified
194+
file_list = list(file_group.datafile_dict.keys())
195+
if args.filename_filter:
196+
original_count = len(file_list)
197+
file_list = [f for f in file_list if args.filename_filter in Path(f).name]
198+
logger.info(
199+
f"Filename filter '{args.filename_filter}' reduced files from {original_count} to {len(file_list)}"
200+
)
201+
202+
# Create TabularSummary from the (possibly filtered) file list
181203
logger.info("Creating tabular summary...")
182-
summary = file_group.summarize(value_cols=args.value_columns, skip_cols=skip_cols)
204+
summary = TabularSummary(value_cols=args.value_columns, skip_cols=skip_cols)
205+
summary.update(file_list)
183206

184207
logger.info(f"Processed {summary.total_files} files")
185208
logger.info(f"Total events: {summary.total_events}")

hed/tools/analysis/tabular_summary.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def __init__(self, value_cols=None, skip_cols=None, name="", categorical_limit=N
3737
for value in value_cols:
3838
self.value_info[value] = [0, 0]
3939
if skip_cols:
40-
self.skip_cols = skip_cols.copy()
40+
self.skip_cols = list(dict.fromkeys(skip_cols))
4141
else:
4242
self.skip_cols = []
4343
self.total_files = 0

hed/tools/bids/bids_file_group.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""A group of BIDS files with specified suffix name."""
22

33
import os
4+
import re
45
import logging
56
import pandas as pd
67

@@ -80,6 +81,26 @@ def summarize(self, value_cols=None, skip_cols=None):
8081
info.update(list(self.datafile_dict.keys()))
8182
return info
8283

84+
def get_task_names(self):
    """Return a sorted list of the unique task names in this group's sidecar and data file names.

    Returns:
        list: Sorted unique task name strings (the ``xxxx`` portion of ``task-xxxx`` BIDS entities).

    Notes:
        - Both ``sidecar_dict`` and ``datafile_dict`` file paths are examined.
        - The BIDS ``task-`` entity is matched case-insensitively.

    """
    # A task entity starts either at the beginning of the basename or after an
    # underscore separator; the captured name ends at the next '_', '.', or '-'.
    entity_regex = re.compile(r"(?:^|_)task-([^_.-]+)", re.IGNORECASE)
    all_paths = (*self.sidecar_dict, *self.datafile_dict)
    hits = (entity_regex.search(os.path.basename(path)) for path in all_paths)
    return sorted({hit.group(1) for hit in hits if hit is not None})
103+
83104
def validate(self, hed_schema, extra_def_dicts=None, check_for_warnings=False):
84105
"""Validate the sidecars and datafiles and return a list of issues.
85106

hed/tools/util/data_util.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ def delete_columns(df, column_list):
6262
"""
6363

6464
delete_cols = list(set(column_list).intersection(set(df)))
65-
df.drop(columns=delete_cols, axis=1, inplace=True)
65+
df.drop(columns=delete_cols, inplace=True)
6666

6767

6868
def delete_rows_by_column(df, value, column_list=None):

0 commit comments

Comments
 (0)