Skip to content

Commit 18efb47

Browse files
committed
fix: improved logging
1 parent 7be0806 commit 18efb47

4 files changed

Lines changed: 33 additions & 13 deletions

File tree

docs/serverless/worker.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,19 @@
11
# The Serverless Worker
22

3+
## Logging
4+
5+
The worker outputs logs to the console at different points in the worker's lifecycle. These logs can be used to debug issues with the worker or handler. There are four logging levels that control the verbosity of the logs:
6+
7+
1. `DEBUG` (Default) - Outputs all logs, including debug logs.
8+
9+
2. `INFO` - Outputs all logs except debug logs.
10+
11+
3. `WARNING` - Outputs only warning and error logs.
12+
13+
4. `ERROR` - Outputs only error logs.
14+
15+
To set the logging level, set the `RUNPOD_DEBUG_LEVEL` environment variable to one of the above levels. For example, to set the logging level to `INFO`, set `RUNPOD_DEBUG_LEVEL=INFO`.
16+
317
## Error Handling
418

519
The worker is designed to handle errors raised by the handler gracefully. If the handler raises an error, the worker will capture this error and return it as the job output along with the stack trace.

runpod/serverless/modules/heartbeat.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ def start_ping():
4141

4242
_send_ping(ping_params)
4343

44+
log.debug(f"Scheduling next heartbeat in {PING_INTERVAL}ms")
4445
heartbeat_thread = threading.Timer(int(PING_INTERVAL / 1000), start_ping)
4546
heartbeat_thread.daemon = True
4647
heartbeat_thread.start()

runpod/serverless/modules/job.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ def _get_local():
2929
if "id" not in test_inputs:
3030
test_inputs["id"] = "local_test"
3131

32+
log.debug(f"Retrieved local job: {test_inputs}")
3233
return test_inputs
3334

3435

@@ -45,6 +46,7 @@ async def get_job(session):
4546
else:
4647
async with session.get(JOB_GET_URL) as response:
4748
next_job = await response.json()
49+
log.debug(f"Retrieved remote job: {next_job}")
4850

4951
if next_job is not None:
5052
log.info(f"Received job: {next_job['id']}")
@@ -59,12 +61,14 @@ def run_job(handler, job):
5961
Run the job using the handler.
6062
Returns the job output or error.
6163
"""
62-
log.info(f'Started working on {job["id"]} at {time.time()} UTC')
64+
start_time = time.time()
65+
log.info(f'Started working on job {job["id"]} at {start_time} UTC')
6366

6467
run_result = {"error": "Failed to return job output or capture error."}
6568

6669
try:
6770
job_output = handler(job)
71+
log.debug(f'Job {job["id"]} handler output: {job_output}')
6872

6973
if isinstance(job_output, bool):
7074
run_result = {"output": job_output}
@@ -82,12 +86,12 @@ def run_job(handler, job):
8286
check_return_size(run_result) # Checks the size of the return body.
8387
except Exception as err: # pylint: disable=broad-except
8488
log.error(f'Error while running job {job["id"]}: {err}')
85-
8689
run_result = {"error": f"handler: {str(err)} \ntraceback: {traceback.format_exc()}"}
87-
8890
finally:
89-
log.info(f'Finished working on {job["id"]} at {time.time()} UTC')
90-
log.info(f"Run result: {run_result}")
91+
end_time = time.time()
92+
log.info(f'Finished working on job {job["id"]} at {end_time} UTC')
93+
log.info(f"Job {job['id']} took {end_time - start_time} seconds to complete")
94+
log.debug(f"Run result: {run_result}")
9195

9296
return run_result # pylint: disable=lost-exception
9397

@@ -102,15 +106,15 @@ async def retry_send_result(session, job_data):
102106
"Content-Type": "application/x-www-form-urlencoded"
103107
}
104108

105-
log.info("result api call")
109+
log.debug("Initiating result API call")
106110
async with session.post(get_done_url(),
107111
data=job_data,
108112
headers=headers,
109113
raise_for_status=True) as resp:
110114
result = await resp.text()
111-
log.debug(result)
115+
log.debug(f"Result API response: {result}")
112116

113-
log.info("done with result api call")
117+
log.info("Completed result API call")
114118

115119

116120
async def send_result(session, job_data, job):
@@ -120,10 +124,10 @@ async def send_result(session, job_data, job):
120124
try:
121125
job_data = json.dumps(job_data, ensure_ascii=False)
122126
if not _IS_LOCAL_TEST:
123-
log.info(f"Sending job results: {job_data}")
127+
log.info(f"Sending job results for {job['id']}: {job_data}")
124128
await retry_send_result(session, job_data)
125129
else:
126-
log.warn(f"Local test job results: {job_data}")
130+
log.warn(f"Local test job results for {job['id']}: {job_data}")
127131

128132
except Exception as err: # pylint: disable=broad-except
129133
log.error(f"Error while returning job result {job['id']}: {err}")

runpod/serverless/work_loop.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,20 +37,21 @@ async def start_worker(config):
3737
job = await get_job(session)
3838

3939
if job is None:
40-
log.info("No job available before idle timeout.")
40+
log.info("No job available, waiting for the next one.")
4141
continue
4242

4343
if job["input"] is None:
44-
log.error("No input parameter provided. Erroring out request.")
44+
log.error(f"Job {job['id']} has no input parameter provided. Skipping this job.")
4545
continue
4646

4747
set_job_id(job["id"])
4848

49+
log.info(f"Processing job {job['id']}")
4950
job_result = run_job(config["handler"], job)
5051

5152
# If refresh_worker is set, pod will be reset after job is complete.
5253
if config.get("refresh_worker", False):
53-
log.info("Refresh worker flag set, stopping pod after job.")
54+
log.info(f"Refresh worker flag set, stopping pod after job {job['id']}.")
5455
job_result["stopPod"] = True
5556

5657
await send_result(session, job_result, job)

0 commit comments

Comments
 (0)