Skip to content

Commit 1b955c0

Browse files
committed
Check # CPUs + memory size on startup
1 parent d25bb5c commit 1b955c0

File tree

7 files changed

+166
-0
lines changed

7 files changed

+166
-0
lines changed

Cargo.lock

Lines changed: 11 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/main/CommandLine.cpp

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
#include <iostream>
5858
#include <lib/clara.hpp>
5959
#include <optional>
60+
#include <thread>
6061

6162
namespace stellar
6263
{
@@ -1607,6 +1608,121 @@ runReportLastHistoryCheckpoint(CommandLineArgs const& args)
16071608
});
16081609
}
16091610

1611+
namespace
1612+
{
1613+
// Before starting the application, we want to check that the host machine meets
1614+
// the minimum system requirements we document (16 GiB RAM, 8 vCPUs). To make
1615+
// the check meaningful, we prevent the node from starting when it fails
1616+
// (instead of, e.g., writing a log message that may go unread). In the case
1617+
// that we aren't able to automatically determine the relevant system
1618+
// information, we allow the operator to set config values with their system
1619+
// information. Note: we intentionally have these as integer values instead of
1620+
// booleans so that there isn't a silent failure if we bump the minimum
1621+
// requirements. If the auto-detected (or operator-provided, in the case of
1622+
// auto-detection failure) system information doesn't meet the minimum
1623+
// requirements, we require the operator to set an additional config value to
1624+
// explicitly acknowledge that they are ignoring the warning.
1625+
1626+
// We want to make the flag value annoying enough to set that the operators have
1627+
// to make an intentional and continuous decision to ignore the warning. We use
1628+
// the version to make sure that every time they upgrade the package, they have
1629+
// to make a new decision to ignore the warning, and we use the public key to
1630+
// make sure that the value is unique per node. Notably, we don't use something
1631+
// that depends on the current time so that restarts after crashes are handled
1632+
// gracefully (assuming the package wasn't upgraded in between).
1633+
bool
1634+
validateSystemInfo(Config const& cfg)
1635+
{
1636+
std::string annoyingValue =
1637+
fmt::format(FMT_STRING("{}-{}"), STELLAR_CORE_VERSION,
1638+
KeyUtils::toStrKey(cfg.NODE_SEED.getPublicKey()));
1639+
1640+
uint64_t memory = rust_bridge::get_host_total_memory();
1641+
if (memory == 0)
1642+
{
1643+
if (!cfg.SYSCHECK_UNKNOWN_MEMORY_DEFAULT)
1644+
{
1645+
LOG_ERROR(DEFAULT_LOG,
1646+
"Unable to determine total memory of the host; please "
1647+
"ensure that the system has at least 16 GiB of RAM. Once "
1648+
"confirmed, set SYSCHECK_UNKNOWN_MEMORY_DEFAULT to the "
1649+
"size of RAM in KiB.");
1650+
return false;
1651+
}
1652+
1653+
LOG_WARNING(DEFAULT_LOG,
1654+
"Unable to determine total memory of the host; using "
1655+
"SYSCHECK_UNKNOWN_MEMORY_DEFAULT value of {} KiB for "
1656+
"checks. Please ensure this is still the correct value.",
1657+
cfg.SYSCHECK_UNKNOWN_MEMORY_DEFAULT);
1658+
1659+
memory = cfg.SYSCHECK_UNKNOWN_MEMORY_DEFAULT;
1660+
}
1661+
1662+
if (memory < static_cast<uint32_t>(16) * 1024 * 1024)
1663+
{
1664+
if (cfg.SYSCHECK_FORCE_IGNORE_MEMORY != annoyingValue)
1665+
{
1666+
LOG_ERROR(
1667+
DEFAULT_LOG,
1668+
"Host only has {} KiB of RAM; stellar-core may not function "
1669+
"properly under heavy load; please ensure that the system has "
1670+
"at least 16 GiB of RAM. To force ignore this warning, set "
1671+
"SYSCHECK_FORCE_IGNORE_MEMORY to \"{}\". Note that this value "
1672+
"differs for every node and version.",
1673+
memory, annoyingValue);
1674+
return false;
1675+
}
1676+
LOG_WARNING(
1677+
DEFAULT_LOG,
1678+
"Host only has {} KiB of RAM; the recommended minimum is 16 GiB",
1679+
memory);
1680+
}
1681+
1682+
unsigned int cpus = std::thread::hardware_concurrency();
1683+
if (cpus == 0)
1684+
{
1685+
if (!cfg.SYSCHECK_UNKNOWN_CPU_DEFAULT)
1686+
{
1687+
LOG_ERROR(
1688+
DEFAULT_LOG,
1689+
"Unable to determine number of vCPUs of the host; please "
1690+
"ensure that the system has at least 8 vCPUs. Once confirmed, "
1691+
"set SYSCHECK_UNKNOWN_CPU_DEFAULT to the number of vCPUs.");
1692+
return false;
1693+
}
1694+
1695+
LOG_WARNING(DEFAULT_LOG,
1696+
"Unable to determine number of vCPUs of the host; using "
1697+
"SYSCHECK_UNKNOWN_CPU_DEFAULT value of {} for checks. "
1698+
"Please ensure this is still the correct value.",
1699+
cfg.SYSCHECK_UNKNOWN_CPU_DEFAULT);
1700+
1701+
cpus = cfg.SYSCHECK_UNKNOWN_CPU_DEFAULT;
1702+
}
1703+
1704+
if (cpus < 8)
1705+
{
1706+
if (cfg.SYSCHECK_FORCE_IGNORE_CPU != annoyingValue)
1707+
{
1708+
LOG_ERROR(DEFAULT_LOG,
1709+
"Host only has {} vCPUs; stellar-core may not function "
1710+
"properly under heavy load; please ensure that the "
1711+
"system has at least 8 vCPUs. To force ignore this "
1712+
"warning, set SYSCHECK_FORCE_IGNORE_CPU to \"{}\". Note "
1713+
"that value differs for every node and version.",
1714+
cpus, annoyingValue);
1715+
return false;
1716+
}
1717+
LOG_WARNING(DEFAULT_LOG,
1718+
"Host only has {} vCPUs; the recommended minimum is 8",
1719+
cpus);
1720+
}
1721+
1722+
return true;
1723+
}
1724+
} // namespace
1725+
16101726
int
16111727
run(CommandLineArgs const& args)
16121728
{
@@ -1682,6 +1798,16 @@ run(CommandLineArgs const& args)
16821798
"enabled (for testing only)");
16831799
}
16841800

1801+
if (gIsProductionNetwork && cfg.NODE_IS_VALIDATOR &&
1802+
!validateSystemInfo(cfg))
1803+
{
1804+
LOG_ERROR(
1805+
DEFAULT_LOG,
1806+
"Host system does not meet the minimum requirements "
1807+
"for running stellar-core. Exiting.");
1808+
return 1;
1809+
}
1810+
16851811
// Second, setup the app with the final configuration.
16861812
clock = std::make_shared<VirtualClock>(clockMode);
16871813
app = setupApp(cfg, *clock);

src/main/Config.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1255,6 +1255,18 @@ Config::processConfig(std::shared_ptr<cpptoml::table> t)
12551255
{"LOG_COLOR", [&]() { LOG_COLOR = readBool(item); }},
12561256
{"BUCKET_DIR_PATH",
12571257
[&]() { BUCKET_DIR_PATH = readString(item); }},
1258+
{"SYSCHECK_UNKNOWN_MEMORY_DEFAULT",
1259+
[&]() {
1260+
SYSCHECK_UNKNOWN_MEMORY_DEFAULT = readInt<uint32_t>(item);
1261+
}},
1262+
{"SYSCHECK_UNKNOWN_CPU_DEFAULT",
1263+
[&]() {
1264+
SYSCHECK_UNKNOWN_CPU_DEFAULT = readInt<uint32_t>(item);
1265+
}},
1266+
{"SYSCHECK_FORCE_IGNORE_MEMORY",
1267+
[&]() { SYSCHECK_FORCE_IGNORE_MEMORY = readString(item); }},
1268+
{"SYSCHECK_FORCE_IGNORE_CPU",
1269+
[&]() { SYSCHECK_FORCE_IGNORE_CPU = readString(item); }},
12581270
{"FILTERED_SOROBAN_KEYS_PATH",
12591271
[&]() {
12601272
LOG_WARNING(DEFAULT_LOG,

src/main/Config.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -672,6 +672,14 @@ class Config : public std::enable_shared_from_this<Config>
672672
bool LOG_COLOR;
673673
std::string BUCKET_DIR_PATH;
674674

675+
// Config parameters controlling startup system info validation (only
676+
// relevant for mainnet validators). See the comment above
677+
// validateSystemInfo in CommandLine.cpp for details. The memory default is
// the size of RAM in KiB; the CPU default is the number of vCPUs.
678+
uint32_t SYSCHECK_UNKNOWN_MEMORY_DEFAULT{0};
679+
uint32_t SYSCHECK_UNKNOWN_CPU_DEFAULT{0};
680+
std::string SYSCHECK_FORCE_IGNORE_MEMORY;
681+
std::string SYSCHECK_FORCE_IGNORE_CPU;
682+
675683
// Path to Protocol 23 corruption CSV file for testing/recovery
676684
std::string PATH_TO_PROTOCOL_23_CORRUPTION_FILE;
677685

src/rust/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ rustc-simple-version = "=0.1.0"
1717
# NB: this must match the same rand version used by soroban (but the tooling
1818
# will complain if it does not match)
1919
rand = "=0.8.5"
20+
sys-info = "=0.9.1"
2021

2122
itertools = "=0.10.5"
2223
backtrace = { version = "=0.3.76", features = [ "cpp_demangle" ] }

src/rust/src/bridge.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,10 @@ pub(crate) mod rust_bridge {
251251
// Check to see if the XDR files used by different rust dependencies match.
252252
fn check_xdr_version_identities() -> Result<()>;
253253

254+
// Get the total memory available on the host machine, in kibibytes.
255+
// Returns 0 if the value cannot be obtained for any reason.
256+
fn get_host_total_memory() -> u64;
257+
254258
// Computes the resource fee given the transaction resource consumption
255259
// and network configuration.
256260
fn compute_transaction_resource_fee(

src/rust/src/common.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,3 +120,7 @@ pub(crate) fn check_xdr_version_identities() -> Result<(), Box<dyn std::error::E
120120
// Add more comparisons between XDR file lists as needed
121121
Ok(())
122122
}
123+
124+
pub(crate) fn get_host_total_memory() -> u64 {
125+
sys_info::mem_info().map(|mem| mem.total).unwrap_or(0)
126+
}

0 commit comments

Comments
 (0)