1818from typing_extensions import NotRequired , TypedDict
1919import time
2020import sys
21- import uuid
2221from concurrent .futures import ThreadPoolExecutor , as_completed
2322
2423from .client import BaseHumanloop
4544from .types import DatapointResponse as Datapoint
4645from .types import (
4746 EvaluationStats ,
48- VersionStatsResponse ,
47+ RunStatsResponse ,
4948 EvaluatorArgumentsType ,
5049 EvaluatorReturnTypeEnum ,
5150 EvaluationResponse ,
@@ -301,15 +300,22 @@ def _run_eval(
301300 if not evaluation :
302301 raise ValueError (f"Evaluation with name { name } not found." )
303302
304- # Every run will generate a new batch of logs
305- batch_id = uuid .uuid4 ().hex [:10 ] # ignore risk of collision
303+ # Create a new Run
304+ run = client .evaluations .create_run (
305+ id = evaluation .id ,
306+ dataset = {"file_id" : hl_dataset .id },
307+ logs = "fixed" ,
308+ orchestrated = False ,
309+ )
310+
311+ # Every Run will generate a new batch of Logs
312+ run_id = run .id
306313 log_func = _get_log_func (
307314 client = client ,
308315 type_ = type_ ,
309316 file_id = hl_file .id ,
310317 version_id = hl_file .version_id ,
311- evaluation_id = evaluation .id ,
312- batch_id = batch_id ,
318+ run_id = run_id ,
313319 )
314320
315321 # Define the function to execute your function in parallel and Log to Humanloop
@@ -382,7 +388,7 @@ def process_datapoint(datapoint: Datapoint):
382388 total_datapoints = len (hl_dataset .datapoints )
383389 logger .info (f"\n { CYAN } Navigate to your Evaluation:{ RESET } \n { evaluation .url } \n " )
384390 logger .info (f"{ CYAN } { type_ .capitalize ()} Version ID: { hl_file .version_id } { RESET } " )
385- logger .info (f"{ CYAN } Run ID: { batch_id } { RESET } " )
391+ logger .info (f"{ CYAN } Run ID: { run_id } { RESET } " )
386392
387393 # Generate locally if a file `callable` is provided
388394 if function_ :
@@ -413,7 +419,10 @@ def process_datapoint(datapoint: Datapoint):
413419 logger .info (stats .report )
414420
415421 checks : List [EvaluatorCheck ] = []
416- if all (evaluator .get ("threshold" ) is None for evaluator in evaluators ) and len (stats .version_stats ) == 1 :
422+ if (
423+ all (evaluator .get ("threshold" ) is None for evaluator in evaluators )
424+ and len (stats .run_stats ) == 1
425+ ):
417426 # Skip `check_evaluation_improvement` if no thresholds were provided and there is only one run.
418427 # (Or the logs would not be helpful)
419428 return checks
@@ -422,7 +431,7 @@ def process_datapoint(datapoint: Datapoint):
422431 evaluation = evaluation ,
423432 stats = stats ,
424433 evaluator_path = evaluator ["path" ],
425- batch_id = batch_id ,
434+ run_id = run_id ,
426435 )
427436 threshold_check = None
428437 threshold = evaluator .get ("threshold" )
@@ -432,7 +441,7 @@ def process_datapoint(datapoint: Datapoint):
432441 stats = stats ,
433442 evaluator_path = evaluator ["path" ],
434443 threshold = threshold ,
435- batch_id = batch_id ,
444+ run_id = run_id ,
436445 )
437446 checks .append (
438447 EvaluatorCheck (
@@ -455,17 +464,15 @@ def _get_log_func(
455464 type_ : FileType ,
456465 file_id : str ,
457466 version_id : str ,
458- evaluation_id : str ,
459- batch_id : str ,
467+ run_id : str ,
460468) -> Callable :
461469 """Returns the appropriate log function pre-filled with common parameters."""
462470 log_request = {
463471 # TODO: why does the Log `id` field refer to the file ID in the API?
464472 # Why are both `id` and `version_id` needed in the API?
465473 "id" : file_id ,
466474 "version_id" : version_id ,
467- "evaluation_id" : evaluation_id ,
468- "batch_id" : batch_id ,
475+ "run_id" : run_id ,
469476 }
470477 if type_ == "flow" :
471478 return partial (client .flows .log , ** log_request , trace_status = "complete" )
@@ -526,14 +533,18 @@ def _progress_bar(total: int, progress: int):
526533
527534
def get_evaluator_stats_by_path(
    stat: "RunStatsResponse", evaluation: "EvaluationResponse"
) -> "Dict[str, Union[NumericStats, BooleanStats]]":
    """Map each Evaluator's stats for a Run to that Evaluator's file path.

    The stats payload only references Evaluators by version ID, so this joins
    ``stat.evaluator_stats`` against ``evaluation.evaluators`` to re-key the
    stats by the human-readable Evaluator path.

    :param stat: Stats for a single Run; each entry in its ``evaluator_stats``
        carries an ``evaluator_version_id`` used for the join.
    :param evaluation: The Evaluation whose ``evaluators`` provide the
        version-id -> path mapping.
    :raises ValueError: If ``stat`` is None — callers look the Run up with
        ``next(..., None)``, so fail loudly instead of with an opaque
        AttributeError.
    :raises KeyError: If a stat references an Evaluator version not present on
        the Evaluation.
    """
    # TODO: Update the API so this client-side join is not necessary
    if stat is None:
        raise ValueError("No Run stats found for the requested Run.")
    evaluators_by_id = {
        evaluator.version.version_id: evaluator for evaluator in evaluation.evaluators
    }
    return {
        evaluators_by_id[evaluator_stat.evaluator_version_id].version.path: evaluator_stat
        for evaluator_stat in stat.evaluator_stats
    }
539550
@@ -543,12 +554,13 @@ def check_evaluation_threshold(
543554 stats : EvaluationStats ,
544555 evaluator_path : str ,
545556 threshold : float ,
546- batch_id : str ,
557+ run_id : str ,
547558) -> bool :
548559 """Checks if the latest version has an average Evaluator result above a threshold."""
549560 # TODO: Update the API so this is not necessary
550561 evaluator_stats_by_path = get_evaluator_stats_by_path (
551- stat = next ((stat for stat in stats .version_stats if stat .batch_id == batch_id ), None ), evaluation = evaluation
562+ stat = next ((stat for stat in stats .run_stats if stat .run_id == run_id ), None ),
563+ evaluation = evaluation ,
552564 )
553565 if evaluator_path in evaluator_stats_by_path :
554566 evaluator_stat = evaluator_stats_by_path [evaluator_path ]
@@ -571,7 +583,7 @@ def check_evaluation_improvement(
571583 evaluation : EvaluationResponse ,
572584 evaluator_path : str ,
573585 stats : EvaluationStats ,
574- batch_id : str ,
586+ run_id : str ,
575587) -> Tuple [bool , float , float ]:
576588 """
577589 Check the latest version has improved across for a specific Evaluator.
@@ -581,14 +593,20 @@ def check_evaluation_improvement(
581593 # TODO: Update the API so this is not necessary
582594
583595 latest_evaluator_stats_by_path = get_evaluator_stats_by_path (
584- stat = next ((stat for stat in stats .version_stats if stat .batch_id == batch_id ), None ), evaluation = evaluation
596+ stat = next ((stat for stat in stats .run_stats if stat .run_id == run_id ), None ),
597+ evaluation = evaluation ,
585598 )
586- if len (stats .version_stats ) == 1 :
599+ if len (stats .run_stats ) == 1 :
587600 logger .info (f"{ YELLOW } ⚠️ No previous versions to compare with.{ RESET } " )
588601 return True , 0 , 0
589602
590- previous_evaluator_stats_by_path = get_evaluator_stats_by_path (stat = stats .version_stats [- 2 ], evaluation = evaluation )
591- if evaluator_path in latest_evaluator_stats_by_path and evaluator_path in previous_evaluator_stats_by_path :
603+ previous_evaluator_stats_by_path = get_evaluator_stats_by_path (
604+ stat = stats .run_stats [- 2 ], evaluation = evaluation
605+ )
606+ if (
607+ evaluator_path in latest_evaluator_stats_by_path
608+ and evaluator_path in previous_evaluator_stats_by_path
609+ ):
592610 latest_evaluator_stat = latest_evaluator_stats_by_path [evaluator_path ]
593611 previous_evaluator_stat = previous_evaluator_stats_by_path [evaluator_path ]
594612 latest_score = get_score_from_evaluator_stat (stat = latest_evaluator_stat )
0 commit comments