1- from collections .abc import Mapping , Sequence
1+ import contextlib
2+ import io
3+ import logging
4+ from typing import Mapping , Sequence
25
36from asgiref .sync import async_to_sync
47from django .conf import settings
58from health_check .cache .backends import CacheBackend
6- from health_check .contrib .psutil .backends import MemoryUsage
79from health_check .db .backends import (
10+ BaseHealthCheckBackend ,
811 DatabaseBackend ,
912 ServiceUnavailable ,
10- BaseHealthCheckBackend ,
1113)
1214
1315from .base import DjangoHealthCheckWrapper , HealthCheck , Outcome , Status
1416from .models import Event
1517
18+ # Get a logger instance
19+ logger = logging .getLogger (__name__ )
20+
1621
1722def disp_window (window : Mapping [str , int ]) -> str :
1823 """
@@ -104,41 +109,6 @@ def get_name(self) -> str:
104109 return "Database"
105110
106111
class RamUsage(DjangoHealthCheckWrapper):
    """
    Health check that flags excessive RAM consumption.

    The actual measurement is delegated to the ``MemoryUsage`` backend, which
    is declared as ``base_class``; this class only supplies the display name,
    the troubleshooting text and the reboot suggestion.
    """

    # Backend class wrapped by this check
    base_class = MemoryUsage

    def suggest_reboot(self, outcome: Outcome) -> Sequence[str]:
        """Return the names of the services suggested for a reboot."""
        services = ["api"]
        return services

    def get_resolving_actions(self, outcome: Outcome) -> str:
        """Return the markdown help text shown for this check."""
        help_text = """# __CODE__ — RAM usage is too high

The memory usage in the container running the application is too high.

## Possible causes

- There is a memory leak in the application
- The application just needs more RAM

## Possible solutions

- Short term, restart the container
- Long term, identify if this issue comes from a leak (in which case you can
fix the leak) or if the application just needs more RAM (in which case you
can increase the RAM allocated to the container)
"""
        return help_text

    def get_name(self) -> str:
        """Return the human-readable name of this check."""
        return "RAM Usage"
138-
139- # :: IF api__redis
140-
141-
142112class Cache (DjangoHealthCheckWrapper ):
143113 """
144114 Validates cache accessibility. Since the queue is also the cache, it will
@@ -172,9 +142,6 @@ def suggest_reboot(self, outcome: Outcome) -> Sequence[str]:
172142 return ["redis" ]
173143
174144
175- # :: ENDIF
176-
177-
178145class ProcrastinateBuiltInHealthCheck (BaseHealthCheckBackend ):
179146 """
180147 Health check for Procrastinate task processor.
@@ -201,8 +168,8 @@ def check_status(self):
201168 Use the built-in healthchecks to check if the Procrastinate app is
202169 working.
203170 """
204- from procrastinate .contrib .django .healthchecks import healthchecks
205171 from procrastinate import exceptions
172+ from procrastinate .contrib .django .healthchecks import healthchecks
206173
207174 try :
208175 async_to_sync (healthchecks )(app = self .app )
@@ -252,3 +219,151 @@ def get_resolving_actions(self, outcome: Outcome) -> str:
252219
253220 def suggest_reboot (self , outcome : Outcome ) -> Sequence [str ]:
254221 return ["procrastinate_worker" ]
222+
223+
class ModuleSimulationBase(HealthCheck):
    """
    Shared implementation for the per-module simulation health checks.

    The check does not run a simulation itself: it reads the events of type
    ``module_<MODULE_NUMBER>_simulation`` recorded within ``WINDOW`` and
    derives a status from their statistics. Subclasses are expected to set
    ``MODULE_NUMBER`` and ``MODULE_NAME``.
    """

    # Number interpolated into the event type looked up by get_status()
    MODULE_NUMBER = None
    # Module name interpolated into the check name and the help text
    MODULE_NAME = None
    # Look-back period, forwarded as keyword arguments to ``.within()``
    WINDOW = {"hours": 7}

    def get_name(self) -> str:
        """Return the display name of the check, built from ``MODULE_NAME``."""
        module = self.MODULE_NAME
        return f"{module} Simulation"

    def get_status(self) -> Outcome:
        """
        Build the outcome from the simulation events found in ``WINDOW``.

        The status is ERROR when no event at all was recorded in the window
        or when the statistics report at least one failure, and OK otherwise.
        """
        simulation_events = Event.objects.type(
            f"module_{self.MODULE_NUMBER}_simulation"
        ).within(**self.WINDOW)
        stats = simulation_events.stats()
        summary = disp_stats(stats)

        if stats["total"] == 0:
            # Nothing recorded in the window: the periodic task likely
            # did not run at all
            status = Status.ERROR
            message = (
                f"No simulation task events found in the last {disp_window(self.WINDOW)} "
            )
        else:
            status = Status.ERROR if stats["failure"] else Status.OK
            message = f"{summary} in the last {disp_window(self.WINDOW)} "

        return Outcome(instance=self, status=status, message=message)

    def get_resolving_actions(self, outcome: Outcome) -> str:
        """Return the markdown help text, customized with ``MODULE_NAME``."""
        module = self.MODULE_NAME
        return f"""# __CODE__ — {module} Simulation Task Failed or Delayed

This check verifies the status of the last background task run for the {module} simulation.
The background task simulates a file generation via WebSocket to verify:
- The WebSocket server is reachable.
- The FSM works correctly.
- The FastAPI modules server is reachable.
- The file generation LLM is reachable.
- The file storage is reachable.

## Possible Causes for ERROR/WARNING:

- **Network Connectivity:** Issues connecting to the WebSocket server, module server, LLM, or storage.
- **Base File Missing:** The required input file (`health_check_files/...`) might be missing from storage.
- **Module/FSM Logic Error:** An error within the specific module's logic or the FSM definition.
- **Resource Exhaustion:** The simulation task might be timing out due to resource limits (CPU, RAM).
- The LLM API keys might be invalid or the LLM provider is down.
"""

    def suggest_reboot(self, outcome: Outcome) -> Sequence[str]:
        """Return the names of the services suggested for a reboot."""
        services = ["fsm", "module server"]
        return services
293+
class Module1Simulation(ModuleSimulationBase):
    """
    Simulation health check for module 1 of the chatbot (Info2ArticleXia).

    All the logic is inherited from the base class; only the module
    identifiers are set here.
    """

    MODULE_NAME = "Info2ArticleXia"
    MODULE_NUMBER = 1
303+
class Module2Simulation(ModuleSimulationBase):
    """
    Simulation health check for module 2 of the chatbot (TopicsIndexGenXia).

    All the logic is inherited from the base class; only the module
    identifiers are set here.
    """

    MODULE_NAME = "TopicsIndexGenXia"
    MODULE_NUMBER = 2
313+
class Module3Simulation(ModuleSimulationBase):
    """
    Simulation health check for module 3 of the chatbot (ColAgreeSumXia).

    All the logic is inherited from the base class; only the module
    identifiers are set here.
    """

    MODULE_NAME = "ColAgreeSumXia"
    MODULE_NUMBER = 3
323+
class LLMCheck(HealthCheck):
    """
    Health check based on the LLM call events recorded recently.

    It reads the ``llm_call_start`` and ``llm_call_complete`` events found
    within ``WINDOW`` and reports an error as soon as their statistics
    contain a failure.
    """

    # Look-back period, forwarded as keyword arguments to ``.within()``
    WINDOW = {"hours": 1}

    def get_name(self) -> str:
        """Return the human-readable name of this check."""
        return "LLM Check"

    def get_status(self) -> Outcome:
        """
        Build the outcome from the LLM events found in ``WINDOW``.

        When the statistics report failures, the ``data`` of every event
        with ``is_success=False`` is attached to the outcome under
        ``extra["errors"]``.
        """
        llm_events = Event.objects.types(
            ["llm_call_complete", "llm_call_start"]
        ).within(**self.WINDOW)
        stats = llm_events.stats()
        summary = disp_stats(stats)

        # Healthy case first: no failure recorded in the window
        if not stats["failure"]:
            return Outcome(
                instance=self,
                status=Status.OK,
                message=f"{summary} in the last {disp_window(self.WINDOW)} ",
            )

        failure_payloads = [
            event.data for event in llm_events.filter(is_success=False)
        ]
        return Outcome(
            instance=self,
            status=Status.ERROR,
            message=f"{summary} in the last {disp_window(self.WINDOW)} ",
            extra={"errors": failure_payloads},
        )

    def get_resolving_actions(self, outcome: Outcome) -> str:
        """Return the markdown help text shown for this check."""
        help_text = """# __CODE__ — LLM failed

This check validates that the enabled LLM are working correctly.

## Possible causes

- The API key is invalid.
- The defined endpoint url is invalid.
- The model provider is down.
"""
        return help_text

    def suggest_reboot(self, outcome: Outcome) -> Sequence[str]:
        """Return the services to reboot; none are suggested for this check."""
        return []
0 commit comments