Skip to content

Commit 18efb47

Browse files
committed
fix: improved logging
1 parent 7be0806 commit 18efb47

4 files changed

Lines changed: 33 additions & 13 deletions

File tree

docs/serverless/worker.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,19 @@
11
# The Serverless Worker
22

3+
## Logging
4+
5+
The worker outputs logs to the console at different points in the worker's lifecycle. These logs can be used to debug issues with the worker or handler. There are four logging levels that control the verbosity of the logs:
6+
7+
1. `DEBUG` (Default) - Outputs all logs, including debug logs.
8+
9+
2. `INFO` - Outputs all logs except debug logs.
10+
11+
3. `WARNING` - Outputs only warning and error logs.
12+
13+
4. `ERROR` - Outputs only error logs.
14+
15+
To set the logging level, set the `RUNPOD_DEBUG_LEVEL` environment variable to one of the above levels. For example, to set the logging level to `INFO`, set `RUNPOD_DEBUG_LEVEL=INFO`.
16+
317
## Error Handling
418

519
The worker is designed to handle errors raised by the handler gracefully. If the handler raises an error, the worker will capture this error and return it as the job output along with the stack trace.

runpod/serverless/modules/heartbeat.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ def start_ping():
4141

4242
_send_ping(ping_params)
4343

44+
log.debug(f"Scheduling next heartbeat in {PING_INTERVAL}ms")
4445
heartbeat_thread = threading.Timer(int(PING_INTERVAL / 1000), start_ping)
4546
heartbeat_thread.daemon = True
4647
heartbeat_thread.start()

runpod/serverless/modules/job.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ def _get_local():
2929
if "id" not in test_inputs:
3030
test_inputs["id"] = "local_test"
3131

32+
log.debug(f"Retrieved local job: {test_inputs}")
3233
return test_inputs
3334

3435

@@ -45,6 +46,7 @@ async def get_job(session):
4546
else:
4647
async with session.get(JOB_GET_URL) as response:
4748
next_job = await response.json()
49+
log.debug(f"Retrieved remote job: {next_job}")
4850

4951
if next_job is not None:
5052
log.info(f"Received job: {next_job['id']}")
@@ -59,12 +61,14 @@ def run_job(handler, job):
5961
Run the job using the handler.
6062
Returns the job output or error.
6163
"""
62-
log.info(f'Started working on {job["id"]} at {time.time()} UTC')
64+
start_time = time.time()
65+
log.info(f'Started working on job {job["id"]} at {start_time} UTC')
6366

6467
run_result = {"error": "Failed to return job output or capture error."}
6568

6669
try:
6770
job_output = handler(job)
71+
log.debug(f'Job {job["id"]} handler output: {job_output}')
6872

6973
if isinstance(job_output, bool):
7074
run_result = {"output": job_output}
@@ -82,12 +86,12 @@ def run_job(handler, job):
8286
check_return_size(run_result) # Checks the size of the return body.
8387
except Exception as err: # pylint: disable=broad-except
8488
log.error(f'Error while running job {job["id"]}: {err}')
85-
8689
run_result = {"error": f"handler: {str(err)} \ntraceback: {traceback.format_exc()}"}
87-
8890
finally:
89-
log.info(f'Finished working on {job["id"]} at {time.time()} UTC')
90-
log.info(f"Run result: {run_result}")
91+
end_time = time.time()
92+
log.info(f'Finished working on job {job["id"]} at {end_time} UTC')
93+
log.info(f"Job {job['id']} took {end_time - start_time} seconds to complete")
94+
log.debug(f"Run result: {run_result}")
9195

9296
return run_result # pylint: disable=lost-exception
9397

@@ -102,15 +106,15 @@ async def retry_send_result(session, job_data):
102106
"Content-Type": "application/x-www-form-urlencoded"
103107
}
104108

105-
log.info("result api call")
109+
log.debug("Initiating result API call")
106110
async with session.post(get_done_url(),
107111
data=job_data,
108112
headers=headers,
109113
raise_for_status=True) as resp:
110114
result = await resp.text()
111-
log.debug(result)
115+
log.debug(f"Result API response: {result}")
112116

113-
log.info("done with result api call")
117+
log.info("Completed result API call")
114118

115119

116120
async def send_result(session, job_data, job):
@@ -120,10 +124,10 @@ async def send_result(session, job_data, job):
120124
try:
121125
job_data = json.dumps(job_data, ensure_ascii=False)
122126
if not _IS_LOCAL_TEST:
123-
log.info(f"Sending job results: {job_data}")
127+
log.info(f"Sending job results for {job['id']}: {job_data}")
124128
await retry_send_result(session, job_data)
125129
else:
126-
log.warn(f"Local test job results: {job_data}")
130+
log.warn(f"Local test job results for {job['id']}: {job_data}")
127131

128132
except Exception as err: # pylint: disable=broad-except
129133
log.error(f"Error while returning job result {job['id']}: {err}")

runpod/serverless/work_loop.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,20 +37,21 @@ async def start_worker(config):
3737
job = await get_job(session)
3838

3939
if job is None:
40-
log.info("No job available before idle timeout.")
40+
log.info("No job available, waiting for the next one.")
4141
continue
4242

4343
if job["input"] is None:
44-
log.error("No input parameter provided. Erroring out request.")
44+
log.error(f"Job {job['id']} has no input parameter provided. Skipping this job.")
4545
continue
4646

4747
set_job_id(job["id"])
4848

49+
log.info(f"Processing job {job['id']}")
4950
job_result = run_job(config["handler"], job)
5051

5152
# If refresh_worker is set, pod will be reset after job is complete.
5253
if config.get("refresh_worker", False):
53-
log.info("Refresh worker flag set, stopping pod after job.")
54+
log.info(f"Refresh worker flag set, stopping pod after job {job['id']}.")
5455
job_result["stopPod"] = True
5556

5657
await send_result(session, job_result, job)

0 commit comments

Comments
 (0)