Skip to content

Commit 616bc45

Browse files
Update eval_utils for the Runs API change
1 parent d3f0888 commit 616bc45

File tree

1 file changed

+43
-25
lines changed

1 file changed

+43
-25
lines changed

src/humanloop/eval_utils.py

Lines changed: 43 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
from typing_extensions import NotRequired, TypedDict
1919
import time
2020
import sys
21-
import uuid
2221
from concurrent.futures import ThreadPoolExecutor, as_completed
2322

2423
from .client import BaseHumanloop
@@ -45,7 +44,7 @@
4544
from .types import DatapointResponse as Datapoint
4645
from .types import (
4746
EvaluationStats,
48-
VersionStatsResponse,
47+
RunStatsResponse,
4948
EvaluatorArgumentsType,
5049
EvaluatorReturnTypeEnum,
5150
EvaluationResponse,
@@ -301,15 +300,22 @@ def _run_eval(
301300
if not evaluation:
302301
raise ValueError(f"Evaluation with name {name} not found.")
303302

304-
# Every run will generate a new batch of logs
305-
batch_id = uuid.uuid4().hex[:10] # ignore risk of collision
303+
# Create a new Run
304+
run = client.evaluations.create_run(
305+
id=evaluation.id,
306+
dataset={"file_id": hl_dataset.id},
307+
logs="fixed",
308+
orchestrated=False,
309+
)
310+
311+
# Every Run will generate a new batch of Logs
312+
run_id = run.id
306313
log_func = _get_log_func(
307314
client=client,
308315
type_=type_,
309316
file_id=hl_file.id,
310317
version_id=hl_file.version_id,
311-
evaluation_id=evaluation.id,
312-
batch_id=batch_id,
318+
run_id=run_id,
313319
)
314320

315321
# Define the function to execute your function in parallel and Log to Humanloop
@@ -382,7 +388,7 @@ def process_datapoint(datapoint: Datapoint):
382388
total_datapoints = len(hl_dataset.datapoints)
383389
logger.info(f"\n{CYAN}Navigate to your Evaluation:{RESET}\n{evaluation.url}\n")
384390
logger.info(f"{CYAN}{type_.capitalize()} Version ID: {hl_file.version_id}{RESET}")
385-
logger.info(f"{CYAN}Run ID: {batch_id}{RESET}")
391+
logger.info(f"{CYAN}Run ID: {run_id}{RESET}")
386392

387393
# Generate locally if a file `callable` is provided
388394
if function_:
@@ -413,7 +419,10 @@ def process_datapoint(datapoint: Datapoint):
413419
logger.info(stats.report)
414420

415421
checks: List[EvaluatorCheck] = []
416-
if all(evaluator.get("threshold") is None for evaluator in evaluators) and len(stats.version_stats) == 1:
422+
if (
423+
all(evaluator.get("threshold") is None for evaluator in evaluators)
424+
and len(stats.run_stats) == 1
425+
):
417426
# Skip `check_evaluation_improvement` if no thresholds were provided and there is only one run.
418427
# (Or the logs would not be helpful)
419428
return checks
@@ -422,7 +431,7 @@ def process_datapoint(datapoint: Datapoint):
422431
evaluation=evaluation,
423432
stats=stats,
424433
evaluator_path=evaluator["path"],
425-
batch_id=batch_id,
434+
run_id=run_id,
426435
)
427436
threshold_check = None
428437
threshold = evaluator.get("threshold")
@@ -432,7 +441,7 @@ def process_datapoint(datapoint: Datapoint):
432441
stats=stats,
433442
evaluator_path=evaluator["path"],
434443
threshold=threshold,
435-
batch_id=batch_id,
444+
run_id=run_id,
436445
)
437446
checks.append(
438447
EvaluatorCheck(
@@ -455,17 +464,15 @@ def _get_log_func(
455464
type_: FileType,
456465
file_id: str,
457466
version_id: str,
458-
evaluation_id: str,
459-
batch_id: str,
467+
run_id: str,
460468
) -> Callable:
461469
"""Returns the appropriate log function pre-filled with common parameters."""
462470
log_request = {
463471
# TODO: why does the Log `id` field refer to the file ID in the API?
464472
# Why are both `id` and `version_id` needed in the API?
465473
"id": file_id,
466474
"version_id": version_id,
467-
"evaluation_id": evaluation_id,
468-
"batch_id": batch_id,
475+
"run_id": run_id,
469476
}
470477
if type_ == "flow":
471478
return partial(client.flows.log, **log_request, trace_status="complete")
@@ -526,14 +533,18 @@ def _progress_bar(total: int, progress: int):
526533

527534

528535
def get_evaluator_stats_by_path(
529-
stat: VersionStatsResponse, evaluation: EvaluationResponse
536+
stat: RunStatsResponse, evaluation: EvaluationResponse
530537
) -> Dict[str, Union[NumericStats, BooleanStats]]:
531538
"""Get the Evaluator stats by path."""
532539
# TODO: Update the API so this is not necessary
533-
evaluators_by_id = {evaluator.version.version_id: evaluator for evaluator in evaluation.evaluators}
540+
evaluators_by_id = {
541+
evaluator.version.version_id: evaluator for evaluator in evaluation.evaluators
542+
}
534543
evaluator_stats_by_path = {
535-
evaluators_by_id[evaluator_stat.evaluator_version_id].version.path: evaluator_stat
536-
for evaluator_stat in stat.evaluator_version_stats
544+
evaluators_by_id[
545+
evaluator_stat.evaluator_version_id
546+
].version.path: evaluator_stat
547+
for evaluator_stat in stat.evaluator_stats
537548
}
538549
return evaluator_stats_by_path
539550

@@ -543,12 +554,13 @@ def check_evaluation_threshold(
543554
stats: EvaluationStats,
544555
evaluator_path: str,
545556
threshold: float,
546-
batch_id: str,
557+
run_id: str,
547558
) -> bool:
548559
"""Checks if the latest version has an average Evaluator result above a threshold."""
549560
# TODO: Update the API so this is not necessary
550561
evaluator_stats_by_path = get_evaluator_stats_by_path(
551-
stat=next((stat for stat in stats.version_stats if stat.batch_id == batch_id), None), evaluation=evaluation
562+
stat=next((stat for stat in stats.run_stats if stat.run_id == run_id), None),
563+
evaluation=evaluation,
552564
)
553565
if evaluator_path in evaluator_stats_by_path:
554566
evaluator_stat = evaluator_stats_by_path[evaluator_path]
@@ -571,7 +583,7 @@ def check_evaluation_improvement(
571583
evaluation: EvaluationResponse,
572584
evaluator_path: str,
573585
stats: EvaluationStats,
574-
batch_id: str,
586+
run_id: str,
575587
) -> Tuple[bool, float, float]:
576588
"""
577589
Check the latest version has improved across for a specific Evaluator.
@@ -581,14 +593,20 @@ def check_evaluation_improvement(
581593
# TODO: Update the API so this is not necessary
582594

583595
latest_evaluator_stats_by_path = get_evaluator_stats_by_path(
584-
stat=next((stat for stat in stats.version_stats if stat.batch_id == batch_id), None), evaluation=evaluation
596+
stat=next((stat for stat in stats.run_stats if stat.run_id == run_id), None),
597+
evaluation=evaluation,
585598
)
586-
if len(stats.version_stats) == 1:
599+
if len(stats.run_stats) == 1:
587600
logger.info(f"{YELLOW}⚠️ No previous versions to compare with.{RESET}")
588601
return True, 0, 0
589602

590-
previous_evaluator_stats_by_path = get_evaluator_stats_by_path(stat=stats.version_stats[-2], evaluation=evaluation)
591-
if evaluator_path in latest_evaluator_stats_by_path and evaluator_path in previous_evaluator_stats_by_path:
603+
previous_evaluator_stats_by_path = get_evaluator_stats_by_path(
604+
stat=stats.run_stats[-2], evaluation=evaluation
605+
)
606+
if (
607+
evaluator_path in latest_evaluator_stats_by_path
608+
and evaluator_path in previous_evaluator_stats_by_path
609+
):
592610
latest_evaluator_stat = latest_evaluator_stats_by_path[evaluator_path]
593611
previous_evaluator_stat = previous_evaluator_stats_by_path[evaluator_path]
594612
latest_score = get_score_from_evaluator_stat(stat=latest_evaluator_stat)

0 commit comments

Comments (0)