diff --git a/docarray/document/mixins/featurehash.py b/docarray/document/mixins/featurehash.py index 04f692e2504..56e474e9d96 100644 --- a/docarray/document/mixins/featurehash.py +++ b/docarray/document/mixins/featurehash.py @@ -67,19 +67,16 @@ def _any_hash(v): try: return int(v) # parse int parameter except ValueError: - try: - return float(v) # parse float parameter - except ValueError: - if not v: - # ignore it when the parameter is empty + if not v: + # ignore it when the parameter is empty + return 0 + if isinstance(v, str): + v = v.strip() + if v.lower() in {'true', 'yes'}: # parse boolean parameter + return 1 + if v.lower() in {'false', 'no'}: return 0 - if isinstance(v, str): - v = v.strip() - if v.lower() in {'true', 'yes'}: # parse boolean parameter - return 1 - if v.lower() in {'false', 'no'}: - return 0 - if isinstance(v, (tuple, dict, list)): - v = json.dumps(v, sort_keys=True) + if isinstance(v, (tuple, dict, list)): + v = json.dumps(v, sort_keys=True) return int(hashlib.md5(str(v).encode('utf-8')).hexdigest(), base=16) diff --git a/tests/unit/document/test_feature_hashing.py b/tests/unit/document/test_feature_hashing.py index edd81932709..8b3d9aa8c8f 100644 --- a/tests/unit/document/test_feature_hashing.py +++ b/tests/unit/document/test_feature_hashing.py @@ -8,10 +8,17 @@ @pytest.mark.parametrize('sparse', [True, False]) @pytest.mark.parametrize('metric', ['jaccard', 'cosine']) def test_feature_hashing(n_dim, sparse, metric): - da = DocumentArray.empty(3) - da.texts = ['hello world', 'world, bye', 'hello bye'] + da = DocumentArray.empty(6) + da.texts = [ + 'hello world', + 'world, bye', + 'hello bye', + 'infinity test', + 'nan test', + '2.3 test', + ] da.apply(lambda d: d.embed_feature_hashing(n_dim=n_dim, sparse=sparse)) - assert da.embeddings.shape == (3, n_dim) + assert da.embeddings.shape == (6, n_dim) da.embeddings = to_numpy_array(da.embeddings) da.match(da, metric=metric, use_scipy=True) result = da['@m', ('id', f'scores__{metric}__value')]