|
57 | 57 | #include <iostream> |
58 | 58 | #include <lib/clara.hpp> |
59 | 59 | #include <optional> |
| 60 | +#include <thread> |
60 | 61 |
|
61 | 62 | namespace stellar |
62 | 63 | { |
@@ -1607,6 +1608,121 @@ runReportLastHistoryCheckpoint(CommandLineArgs const& args) |
1607 | 1608 | }); |
1608 | 1609 | } |
1609 | 1610 |
|
| 1611 | +namespace |
| 1612 | +{ |
| 1613 | +// Before starting the application, we want to check that the host machine meets |
| 1614 | +// the minimum system requirements we document (16 GiB RAM, 8 vCPUs). To make |
| 1615 | +// the check meaningful, we prevent the node from starting when it fails |
| 1616 | +// (instead of, e.g., writing a log message that may go unread). In the case |
| 1617 | +// that we aren't able to automatically determine the relevant system |
| 1618 | +// information, we allow the operator to set config values with their system |
| 1619 | +// information. Note: we intentionally have these as integer values instead of |
| 1620 | +// booleans so that there isn't a silent failure if we bump the minimum |
| 1621 | +// requirements. If the auto-detected (or operator-provided, in the case of |
| 1622 | +// auto-detection failure) system information doesn't meet the minimum |
| 1623 | +// requirements, we require the operator to set an additional config value to |
| 1624 | +// explicitly acknowledge that they are ignoring the warning. |
| 1625 | + |
| 1626 | +// We want to make the flag value annoying enough to set that the operators have |
| 1627 | +// to make an intentional and continuous decision to ignore the warning. We use |
| 1628 | +// the version to make sure that every time they upgrade the package, they have |
| 1629 | +// to make a new decision to ignore the warning, and we use the public key to |
| 1630 | +// make sure that the value is unique per node. Notably, we don't use something |
| 1631 | +// that depends on the current time so that restarts after crashes are handled |
| 1632 | +// gracefully (assuming the package wasn't upgraded in between). |
| 1633 | +bool |
| 1634 | +validateSystemInfo(Config const& cfg) |
| 1635 | +{ |
| 1636 | + std::string annoyingValue = |
| 1637 | + fmt::format(FMT_STRING("{}-{}"), STELLAR_CORE_VERSION, |
| 1638 | + KeyUtils::toStrKey(cfg.NODE_SEED.getPublicKey())); |
| 1639 | + |
| 1640 | + uint64_t memory = rust_bridge::get_host_total_memory(); |
| 1641 | + if (memory == 0) |
| 1642 | + { |
| 1643 | + if (!cfg.SYSCHECK_UNKNOWN_MEMORY_DEFAULT) |
| 1644 | + { |
| 1645 | + LOG_ERROR(DEFAULT_LOG, |
| 1646 | + "Unable to determine total memory of the host; please " |
| 1647 | + "ensure that the system has at least 16 GiB of RAM. Once " |
| 1648 | + "confirmed, set SYSCHECK_UNKNOWN_MEMORY_DEFAULT to the " |
| 1649 | + "size of RAM in KiB."); |
| 1650 | + return false; |
| 1651 | + } |
| 1652 | + |
| 1653 | + LOG_WARNING(DEFAULT_LOG, |
| 1654 | + "Unable to determine total memory of the host; using " |
| 1655 | + "SYSCHECK_UNKNOWN_MEMORY_DEFAULT value of {} KiB for " |
| 1656 | + "checks. Please ensure this is still the correct value.", |
| 1657 | + cfg.SYSCHECK_UNKNOWN_MEMORY_DEFAULT); |
| 1658 | + |
| 1659 | + memory = cfg.SYSCHECK_UNKNOWN_MEMORY_DEFAULT; |
| 1660 | + } |
| 1661 | + |
| 1662 | + if (memory < static_cast<uint32_t>(16) * 1024 * 1024) |
| 1663 | + { |
| 1664 | + if (cfg.SYSCHECK_FORCE_IGNORE_MEMORY != annoyingValue) |
| 1665 | + { |
| 1666 | + LOG_ERROR( |
| 1667 | + DEFAULT_LOG, |
| 1668 | + "Host only has {} KiB of RAM; stellar-core may not function " |
| 1669 | + "properly under heavy load; please ensure that the system has " |
| 1670 | + "at least 16 GiB of RAM. To force ignore this warning, set " |
| 1671 | + "SYSCHECK_FORCE_IGNORE_MEMORY to \"{}\". Note that this value " |
| 1672 | + "differs for every node and version.", |
| 1673 | + memory, annoyingValue); |
| 1674 | + return false; |
| 1675 | + } |
| 1676 | + LOG_WARNING( |
| 1677 | + DEFAULT_LOG, |
| 1678 | + "Host only has {} KiB of RAM; the recommended minimum is 16 GiB", |
| 1679 | + memory); |
| 1680 | + } |
| 1681 | + |
| 1682 | + unsigned int cpus = std::thread::hardware_concurrency(); |
| 1683 | + if (cpus == 0) |
| 1684 | + { |
| 1685 | + if (!cfg.SYSCHECK_UNKNOWN_CPU_DEFAULT) |
| 1686 | + { |
| 1687 | + LOG_ERROR( |
| 1688 | + DEFAULT_LOG, |
| 1689 | + "Unable to determine number of vCPUs of the host; please " |
| 1690 | + "ensure that the system has at least 8 vCPUs. Once confirmed, " |
| 1691 | + "set SYSCHECK_UNKNOWN_CPU_DEFAULT to the number of vCPUs."); |
| 1692 | + return false; |
| 1693 | + } |
| 1694 | + |
| 1695 | + LOG_WARNING(DEFAULT_LOG, |
| 1696 | + "Unable to determine number of vCPUs of the host; using " |
| 1697 | + "SYSCHECK_UNKNOWN_CPU_DEFAULT value of {} for checks. " |
| 1698 | + "Please ensure this is still the correct value.", |
| 1699 | + cfg.SYSCHECK_UNKNOWN_CPU_DEFAULT); |
| 1700 | + |
| 1701 | + cpus = cfg.SYSCHECK_UNKNOWN_CPU_DEFAULT; |
| 1702 | + } |
| 1703 | + |
| 1704 | + if (cpus < 8) |
| 1705 | + { |
| 1706 | + if (cfg.SYSCHECK_FORCE_IGNORE_CPU != annoyingValue) |
| 1707 | + { |
| 1708 | + LOG_ERROR(DEFAULT_LOG, |
| 1709 | + "Host only has {} vCPUs; stellar-core may not function " |
| 1710 | + "properly under heavy load; please ensure that the " |
| 1711 | + "system has at least 8 vCPUs. To force ignore this " |
| 1712 | + "warning, set SYSCHECK_FORCE_IGNORE_CPU to \"{}\". Note " |
| 1713 | + "that value differs for every node and version.", |
| 1714 | + cpus, annoyingValue); |
| 1715 | + return false; |
| 1716 | + } |
| 1717 | + LOG_WARNING(DEFAULT_LOG, |
| 1718 | + "Host only has {} vCPUs; the recommended minimum is 8", |
| 1719 | + cpus); |
| 1720 | + } |
| 1721 | + |
| 1722 | + return true; |
| 1723 | +} |
| 1724 | +} // namespace |
| 1725 | + |
1610 | 1726 | int |
1611 | 1727 | run(CommandLineArgs const& args) |
1612 | 1728 | { |
@@ -1682,6 +1798,16 @@ run(CommandLineArgs const& args) |
1682 | 1798 | "enabled (for testing only)"); |
1683 | 1799 | } |
1684 | 1800 |
|
| 1801 | + if (gIsProductionNetwork && cfg.NODE_IS_VALIDATOR && |
| 1802 | + !validateSystemInfo(cfg)) |
| 1803 | + { |
| 1804 | + LOG_ERROR( |
| 1805 | + DEFAULT_LOG, |
| 1806 | + "Host system does not meet the minimum requirements " |
| 1807 | + "for running stellar-core. Exiting."); |
| 1808 | + return 1; |
| 1809 | + } |
| 1810 | + |
1685 | 1811 | // Second, setup the app with the final configuration. |
1686 | 1812 | clock = std::make_shared<VirtualClock>(clockMode); |
1687 | 1813 | app = setupApp(cfg, *clock); |
|
0 commit comments