From 9a1b095a817f6c0b11e68a386fbc8fab466fa98c Mon Sep 17 00:00:00 2001 From: Sylvain Chapeland Date: Wed, 26 Feb 2025 10:25:02 +0100 Subject: [PATCH 1/6] ignore sigpipe --- src/mainReadout.cxx | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mainReadout.cxx b/src/mainReadout.cxx index 9b383ad..18263fa 100644 --- a/src/mainReadout.cxx +++ b/src/mainReadout.cxx @@ -573,6 +573,7 @@ int Readout::_init(int argc, char* argv[]) sigaction(SIGTERM, &signalSettings, NULL); sigaction(SIGQUIT, &signalSettings, NULL); sigaction(SIGINT, &signalSettings, NULL); + signal(SIGPIPE, SIG_IGN); // log startup and options theLog.log(LogInfoSupport_(3001), "Readout " READOUT_VERSION " - process starting, pid %d for role %s", getpid(), occRole.c_str()); From 4f6d1f6febd1f5c9f5606668851fa9b6f5b1c561 Mon Sep 17 00:00:00 2001 From: Sylvain Chapeland Date: Wed, 26 Feb 2025 10:25:31 +0100 Subject: [PATCH 2/6] TFperiod default changed to 32 --- doc/configurationParameters.md | 2 +- doc/releaseNotes.md | 4 ++++ src/ReadoutEquipment.cxx | 4 ++-- src/ReadoutEquipment.h | 2 +- src/readRaw.cxx | 2 +- src/readoutConfigEditor.tcl | 2 +- 6 files changed, 10 insertions(+), 6 deletions(-) diff --git a/doc/configurationParameters.md b/doc/configurationParameters.md index 915fb05..2a2d665 100644 --- a/doc/configurationParameters.md +++ b/doc/configurationParameters.md @@ -161,7 +161,7 @@ The parameters related to 3rd-party libraries are described here for convenience | equipment-* | saveErrorPagesMax | int | 0 | If set, pages found with data error are saved to disk up to given maximum. | | equipment-* | saveErrorPagesPath | string | | Path where to save data pages with errors (when feature enabled). | | equipment-* | stopOnError | int | 0 | If 1, readout will stop automatically on equipment error. | -| equipment-* | TFperiod | int | 128 | Duration of a timeframe, in number of LHC orbits. | +| equipment-* | TFperiod | int | 32 | Duration of a timeframe, in number of LHC orbits. 
| | equipment-* | verbose | int | 0 | If set, extra debug messages may be logged. | | equipment-cruemulator-* | cruBlockSize | int | 8192 | Size of a RDH block. | | equipment-cruemulator-* | cruId | int | 0 | CRU Id, used for CRU Id field in RDH. | diff --git a/doc/releaseNotes.md b/doc/releaseNotes.md index 04af9f2..f26366b 100644 --- a/doc/releaseNotes.md +++ b/doc/releaseNotes.md @@ -655,3 +655,7 @@ This file describes the main feature changes for each readout.exe released versi ## v2.27.1 - 12/02/2025 - Path of configuration file from which defaults are loaded on startup has been changed to /etc/o2.d/readout/readout-defaults.cfg (it was in /etc/o2.d before, now changing to standard subdirectory location). + +## next version +- Updated configuration parameters: + - equipment.TFperiod is now set to 32 by default, instead of 128 previously. This is the duration of a timeframe, in number of LHC orbits. The new value corresponds to what is used in production. diff --git a/src/ReadoutEquipment.cxx b/src/ReadoutEquipment.cxx index 14ec3c4..aa90f86 100644 --- a/src/ReadoutEquipment.cxx +++ b/src/ReadoutEquipment.cxx @@ -161,8 +161,8 @@ ReadoutEquipment::ReadoutEquipment(ConfigFile& cfg, std::string cfgEntryPoint, b cfg.getOptionalValue(cfgEntryPoint + ".verbose", cfgVerbose); if (!cfgDisableTimeframes) { - // configuration parameter: | equipment-* | TFperiod | int | 128 | Duration of a timeframe, in number of LHC orbits. | - int cfgTFperiod = 128; + // configuration parameter: | equipment-* | TFperiod | int | 32 | Duration of a timeframe, in number of LHC orbits. 
| + int cfgTFperiod = 32; cfg.getOptionalValue(cfgEntryPoint + ".TFperiod", cfgTFperiod); timeframePeriodOrbits = cfgTFperiod; diff --git a/src/ReadoutEquipment.h b/src/ReadoutEquipment.h index 8d69787..4db44df 100644 --- a/src/ReadoutEquipment.h +++ b/src/ReadoutEquipment.h @@ -165,7 +165,7 @@ class ReadoutEquipment //const unsigned int LHCBunches = 3564; // number of bunches in LHC const unsigned int LHCOrbitRate = 11246; // LHC orbit rate, in Hz. 299792458 / 26659 - uint32_t timeframePeriodOrbits = 128; // timeframe interval duration in number of LHC orbits + uint32_t timeframePeriodOrbits = 32; // timeframe interval duration in number of LHC orbits double timeframeRate = 0; // timeframe rate, when generated internally // RDH-related configuration parameters diff --git a/src/readRaw.cxx b/src/readRaw.cxx index a7574b9..7cf70f3 100644 --- a/src/readRaw.cxx +++ b/src/readRaw.cxx @@ -76,7 +76,7 @@ int main(int argc, const char* argv[]) " dumpDataInline=(int) : if set, each packet raw content is printed (hex dump style).\n" " dumpStats=(int) : if set, some statistics are printed on HBF/TF size.\n" " fileReadVerbose=(int) : if set, more information is printed when reading/decoding file.\n" - " timeframePeriodOrbits=(int) : if set, TF id computed (and printed, when dump enabled) for each RDH. Typically, 128 or 256.\n" + " timeframePeriodOrbits=(int) : if set, TF id computed (and printed, when dump enabled) for each RDH. Typically, 32 or 128.\n" " logOff=(int) : if set, logs disabled.\n" " dumpOrbitStats=(int) : if set, first / min / max orbits are printed after file read.\n" " \n", diff --git a/src/readoutConfigEditor.tcl b/src/readoutConfigEditor.tcl index 232779d..0b5a7ea 100755 --- a/src/readoutConfigEditor.tcl +++ b/src/readoutConfigEditor.tcl @@ -107,7 +107,7 @@ set configurationParametersDescriptor { | equipment-* | saveErrorPagesMax | int | 0 | If set, pages found with data error are saved to disk up to given maximum. 
| | equipment-* | saveErrorPagesPath | string | | Path where to save data pages with errors (when feature enabled). | | equipment-* | stopOnError | int | 0 | If 1, readout will stop automatically on equipment error. | -| equipment-* | TFperiod | int | 128 | Duration of a timeframe, in number of LHC orbits. | +| equipment-* | TFperiod | int | 32 | Duration of a timeframe, in number of LHC orbits. | | equipment-* | verbose | int | 0 | If set, extra debug messages may be logged. | | equipment-cruemulator-* | cruBlockSize | int | 8192 | Size of a RDH block. | | equipment-cruemulator-* | cruId | int | 0 | CRU Id, used for CRU Id field in RDH. | From 22ab99ed6508843e935b311264a11a687e7b9268 Mon Sep 17 00:00:00 2001 From: Sylvain Chapeland Date: Wed, 19 Mar 2025 09:53:53 +0100 Subject: [PATCH 3/6] changed test settings --- src/testROC.cxx | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/testROC.cxx b/src/testROC.cxx index 8201ce6..a2066aa 100644 --- a/src/testROC.cxx +++ b/src/testROC.cxx @@ -57,10 +57,11 @@ class ROCdevice private: std::string bankId = "testROC"; - size_t bankSize = 2 * 1024 * 1024 * 1024L; + size_t bankSize = 1024 * 1024 * 1024L; // buffer size - size_t memoryPoolNumberOfPages = 1000; size_t memoryPoolPageSize = 2 * 1024 * 1024; + size_t memoryPoolNumberOfPages = 10 * int((bankSize / memoryPoolPageSize) / 10); // number of pages fitting in memory, round down to ten + const char* memoryPoolType = "malloc"; std::string cardId = "0:0.0"; int cfgChannelNumber = 0; @@ -80,7 +81,7 @@ ROCdevice::ROCdevice(std::string id) bankId += id; - bank = getMemoryBank(bankSize, "MemoryMappedFile", bankId); + bank = getMemoryBank(bankSize, memoryPoolType, bankId); theMemoryBankManager.addBank(bank); mp = theMemoryBankManager.getPagedPool(memoryPoolPageSize, memoryPoolNumberOfPages, bankId); // pool of pages @@ -91,6 +92,7 @@ ROCdevice::ROCdevice(std::string id) params.setCardId(AliceO2::roc::Parameters::cardIdFromString(cardId)); 
params.setChannelNumber(cfgChannelNumber); params.setDataSource(AliceO2::roc::DataSource::fromString(cfgDataSource)); + params.setFirmwareCheckEnabled(0); params.setBufferParameters(AliceO2::roc::buffer_parameters::Memory{ mp->getBaseBlockAddress(), mp->getBaseBlockSize() }); @@ -106,6 +108,8 @@ ROCdevice::ROCdevice(std::string id) std::string infoFirmwareVersion = channel->getFirmwareInfo().value_or("unknown"); std::string infoCardId = channel->getCardId().value_or("unknown"); theLog.log(LogInfoDevel_(3010), "ROC PCI %s @ NUMA node %d, serial number %s, firmware version %s, card id %s", infoPciAddress.c_str(), infoNumaNode, infoSerialNumber.c_str(), infoFirmwareVersion.c_str(), infoCardId.c_str()); + // test getCounterFirstOrbit() + // printf("get = %X\n", (int)(channel->getCounterFirstOrbit())); } ROCdevice::~ROCdevice() {} From 0cd404d125d094de679138f800a4137afae3f0dc Mon Sep 17 00:00:00 2001 From: Sylvain Chapeland Date: Wed, 19 Mar 2025 10:01:09 +0100 Subject: [PATCH 4/6] added a monitoring queue to push spontaneous measurements --- CMakeLists.txt | 1 + src/ConsumerStats.cxx | 7 ++++++ src/ReadoutMonitoringQueue.cxx | 36 +++++++++++++++++++++++++++++ src/ReadoutMonitoringQueue.h | 42 ++++++++++++++++++++++++++++++++++ src/mainReadout.cxx | 4 ++++ 5 files changed, 90 insertions(+) create mode 100644 src/ReadoutMonitoringQueue.cxx create mode 100644 src/ReadoutMonitoringQueue.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 74b54e5..e4e9d08 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -258,6 +258,7 @@ add_library( ${SOURCE_DIR}/MemoryBank.cxx ${SOURCE_DIR}/MemoryBankManager.cxx ${SOURCE_DIR}/MemoryPagesPool.cxx + ${SOURCE_DIR}/ReadoutMonitoringQueue.cxx $<$:${SOURCE_DIR}/ZmqServer.cxx> $<$:${SOURCE_DIR}/ZmqClient.cxx> ) diff --git a/src/ConsumerStats.cxx b/src/ConsumerStats.cxx index e646a44..780219f 100644 --- a/src/ConsumerStats.cxx +++ b/src/ConsumerStats.cxx @@ -23,6 +23,7 @@ #include "DataSet.h" #include "ReadoutUtils.h" #include "ReadoutStats.h" 
+#include "ReadoutMonitoringQueue.h" using namespace o2::monitoring; @@ -182,6 +183,12 @@ class ConsumerStats : public Consumer sendMetricNoException(Metric{"readout.bufferUsage"}.addValue((int)(r*100), "value").addValue(b, "bytes").addTag(tags::Key::ID, i)); } } + + // publish measurements stored in monitoring queue + auto ff = [&] (const ReadoutMonitoringMetric &m) -> void { + sendMetricNoException(Metric{m.value, m.name}.addTag(tags::Key::ID, m.tag)); + }; + gReadoutMonitoringQueue.execute(ff); } #ifdef WITH_ZMQ diff --git a/src/ReadoutMonitoringQueue.cxx b/src/ReadoutMonitoringQueue.cxx new file mode 100644 index 0000000..c705579 --- /dev/null +++ b/src/ReadoutMonitoringQueue.cxx @@ -0,0 +1,36 @@ +#include "ReadoutMonitoringQueue.h" + +ReadoutMonitoringQueue::ReadoutMonitoringQueue() { +} + +ReadoutMonitoringQueue::~ReadoutMonitoringQueue() { +} + +void ReadoutMonitoringQueue::push(ReadoutMonitoringMetric m) { + std::unique_lock lock(qMutex); + q.push_front(std::move(m)); +} + +void ReadoutMonitoringQueue::execute(std::function f) { + for (;;) { + ReadoutMonitoringMetric m; + + { + std::unique_lock lock(qMutex); + if (q.empty()) { + break; + } + m = std::move(q.back()); + q.pop_back(); + } + + f(m); + } +} + +void ReadoutMonitoringQueue::clear() { + std::unique_lock lock(qMutex); + q.clear(); +} + +ReadoutMonitoringQueue gReadoutMonitoringQueue; diff --git a/src/ReadoutMonitoringQueue.h b/src/ReadoutMonitoringQueue.h new file mode 100644 index 0000000..f4c2a13 --- /dev/null +++ b/src/ReadoutMonitoringQueue.h @@ -0,0 +1,42 @@ +#include +#include +#include +#include + +// a metric to be stored in queue for later processing +// fields as for o2::monitoring::metric +struct ReadoutMonitoringMetric { + std::string name; + unsigned short int tag; + uint64_t value; +}; + + +// producer-consumer queue to define and publish metrics +// typical use: +// the module that pushes has no access to o2 Monitoring +// the module that publishes reads from the queue and publishes 
them to o2 Monitoring +// the class is not aware of o2::monitoring, it's just a transient thread-safe storage + +class ReadoutMonitoringQueue { + public: + + ReadoutMonitoringQueue(); + ~ReadoutMonitoringQueue(); + + // push an element in the queue + void push(ReadoutMonitoringMetric); + + // execute provided function on all elements in the queue + // (and remove them from the queue) + void execute(std::function); + + // remove all elements in queue + void clear(); + + private: + std::mutex qMutex; + std::deque q; +}; + +extern ReadoutMonitoringQueue gReadoutMonitoringQueue; diff --git a/src/mainReadout.cxx b/src/mainReadout.cxx index 18263fa..792936a 100644 --- a/src/mainReadout.cxx +++ b/src/mainReadout.cxx @@ -79,6 +79,7 @@ #include "ReadoutVersion.h" #include "TtyChecker.h" #include "ReadoutConst.h" +#include "ReadoutMonitoringQueue.h" #ifdef WITH_NUMA #include @@ -774,6 +775,9 @@ int Readout::_configure(const boost::property_tree::ptree& properties) // reset some flags gReadoutStats.isFairMQ = 0; // disable FMQ stats + // reset monitoring queue + gReadoutMonitoringQueue.clear(); + // load configuration file theLog.log(LogInfoSupport, "Reading configuration from %s %s", cfgFileURI, cfgFileEntryPoint); From 9d83f3d5c61bd27abcb077f41d29c58913b0ab2d Mon Sep 17 00:00:00 2001 From: Sylvain Chapeland Date: Wed, 19 Mar 2025 10:07:08 +0100 Subject: [PATCH 5/6] added monitorFirstOrbitEnabled --- doc/configurationParameters.md | 1 + doc/releaseNotes.md | 2 ++ src/ReadoutEquipmentRORC.cxx | 10 ++++++++++ src/readoutConfigEditor.tcl | 1 + 4 files changed, 14 insertions(+) diff --git a/doc/configurationParameters.md b/doc/configurationParameters.md index 2a2d665..425f537 100644 --- a/doc/configurationParameters.md +++ b/doc/configurationParameters.md @@ -194,6 +194,7 @@ The parameters related to 3rd-party libraries are described here for convenience | equipment-rorc-* | firmwareCheckEnabled | int | 1 | If set, RORC driver checks compatibility with detected firmware. 
Use 0 to bypass this check (eg new fw version not yet recognized by ReadoutCard version). | | equipment-rorc-* | firmwareVersionsAllowed | string | | Comma-separated list of ROC firmware versions allowed (6-digit hash). If empty, all are allowed. | | equipment-rorc-* | firmwareVersionsDenied | string | e4a5a46e | Comma-separated list of ROC firmware versions denied (6-digit hash), i.e. which would cause configuration to abort. | +| equipment-rorc-* | monitorFirstOrbitEnabled | int | 0 | If set, enable monitoring of RORC first orbit. | | equipment-zmq-* | address | string | | Address of remote server to connect, eg tcp://remoteHost:12345. | | equipment-zmq-* | mode | string | stream | Possible values: stream (1 input ZMQ message = 1 output data page), snapshot (last ZMQ message = one output data page per TF). | | equipment-zmq-* | timeframeClientUrl | string | | The address to be used to retrieve current timeframe. When set, data is published only once for each TF id published by remote server. | diff --git a/doc/releaseNotes.md b/doc/releaseNotes.md index f26366b..a215833 100644 --- a/doc/releaseNotes.md +++ b/doc/releaseNotes.md @@ -659,3 +659,5 @@ This file describes the main feature changes for each readout.exe released versi ## next version - Updated configuration parameters: - equipment.TFperiod is now set to 32 by default, instead of 128 previously. This is the duration of a timeframe, in number of LHC orbits. The new value corresponds to what is used in production. + - equipment-rorc-*: added parameter monitorFirstOrbitEnabled, to enable reporting to monitoring system the metric "readout.RORCfirstOrbit" on startup, as retrieved from CRU firmware. This is to be used on CTP FLP to detect possibly wrong orbit. A return value of 0xFFFFFFFF indicates the value could not be retrieved. +- ConsumerStats: added an internal queue to allow pushing spontaneous monitoring measurements (compared to periodic ones) from any readout module. 
diff --git a/src/ReadoutEquipmentRORC.cxx b/src/ReadoutEquipmentRORC.cxx index 3a47a19..e810804 100644 --- a/src/ReadoutEquipmentRORC.cxx +++ b/src/ReadoutEquipmentRORC.cxx @@ -26,6 +26,7 @@ #include "ReadoutEquipment.h" #include "ReadoutUtils.h" #include "readoutInfoLogger.h" +#include "ReadoutMonitoringQueue.h" class ReadoutEquipmentRORC : public ReadoutEquipment { @@ -131,6 +132,10 @@ ReadoutEquipmentRORC::ReadoutEquipmentRORC(ConfigFile& cfg, std::string name) : // configuration parameter: | equipment-rorc-* | debugStatsEnabled | int | 0 | If set, enable extra statistics about internal buffers status. (printed to stdout when stopping) | cfg.getOptionalValue(name + ".debugStatsEnabled", cfgDebugStatsEnabled); + // configuration parameter: | equipment-rorc-* | monitorFirstOrbitEnabled | int | 0 | If set, enable monitoring of RORC first orbit. | + int cfgMonitorFirstOrbitEnabled = 0; + cfg.getOptionalValue(name + ".monitorFirstOrbitEnabled", cfgMonitorFirstOrbitEnabled); + // get readout memory buffer parameters // std::string sMemorySize=cfg.getValue(name + ".memoryBufferSize"); // std::string sPageSize=cfg.getValue(name + ".memoryPageSize"); long long @@ -216,6 +221,11 @@ ReadoutEquipmentRORC::ReadoutEquipmentRORC(ConfigFile& cfg, std::string name) : BOOST_THROW_EXCEPTION(ReadoutEquipmentRORCException() << ErrorInfo::Message("This firmware version is not allowed")); } + // publish relevant card info + if (cfgMonitorFirstOrbitEnabled) { + gReadoutMonitoringQueue.push({.name = "readout.RORCfirstOrbit", .tag = id, .value = (uint64_t)channel->getCounterFirstOrbit()}); + } + // todo: log parameters ? if (logRocCallsEnable) { diff --git a/src/readoutConfigEditor.tcl b/src/readoutConfigEditor.tcl index 0b5a7ea..653e7e0 100755 --- a/src/readoutConfigEditor.tcl +++ b/src/readoutConfigEditor.tcl @@ -140,6 +140,7 @@ set configurationParametersDescriptor { | equipment-rorc-* | firmwareCheckEnabled | int | 1 | If set, RORC driver checks compatibility with detected firmware. 
Use 0 to bypass this check (eg new fw version not yet recognized by ReadoutCard version). | | equipment-rorc-* | firmwareVersionsAllowed | string | | Comma-separated list of ROC firmware versions allowed (6-digit hash). If empty, all are allowed. | | equipment-rorc-* | firmwareVersionsDenied | string | e4a5a46e | Comma-separated list of ROC firmware versions denied (6-digit hash), i.e. which would cause configuration to abort. | +| equipment-rorc-* | monitorFirstOrbitEnabled | int | 0 | If set, enable monitoring of RORC first orbit. | | equipment-zmq-* | address | string | | Address of remote server to connect, eg tcp://remoteHost:12345. | | equipment-zmq-* | mode | string | stream | Possible values: stream (1 input ZMQ message = 1 output data page), snapshot (last ZMQ message = one output data page per TF). | | equipment-zmq-* | timeframeClientUrl | string | | The address to be used to retrieve current timeframe. When set, data is published only once for each TF id published by remote server. | From b206dd31ca4d79335b066a81e2e63cc80600a4bf Mon Sep 17 00:00:00 2001 From: Sylvain Chapeland Date: Wed, 19 Mar 2025 10:08:27 +0100 Subject: [PATCH 6/6] v2.27.2 --- doc/releaseNotes.md | 2 +- src/ReadoutVersion.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/releaseNotes.md b/doc/releaseNotes.md index a215833..b7b9202 100644 --- a/doc/releaseNotes.md +++ b/doc/releaseNotes.md @@ -656,7 +656,7 @@ This file describes the main feature changes for each readout.exe released versi ## v2.27.1 - 12/02/2025 - Path of configuration file from which defaults are loaded on startup has been changed to /etc/o2.d/readout/readout-defaults.cfg (it was in /etc/o2.d before, now changing to standard subdirectory location). -## next version +## v2.27.2 - 19/03/2025 - Updated configuration parameters: - equipment.TFperiod is now set to 32 by default, instead of 128 previously. This is the duration of a timeframe, in number of LHC orbits. 
The new value corresponds to what is used in production. - equipment-rorc-*: added parameter monitorFirstOrbitEnabled, to enable reporting to monitoring system the metric "readout.RORCfirstOrbit" on startup, as retrieved from CRU firmware. This is to be used on CTP FLP to detect possibly wrong orbit. A return value of 0xFFFFFFFF indicates the value could not be retrieved. diff --git a/src/ReadoutVersion.h b/src/ReadoutVersion.h index d2b5cb4..d304e79 100644 --- a/src/ReadoutVersion.h +++ b/src/ReadoutVersion.h @@ -9,5 +9,5 @@ // granted to it by virtue of its status as an Intergovernmental Organization // or submit itself to any jurisdiction. -#define READOUT_VERSION "2.27.1" +#define READOUT_VERSION "2.27.2"