From 5ce308bc4d4529be922d97164e4c4b4cabe2812e Mon Sep 17 00:00:00 2001 From: Jack Challen Date: Fri, 25 Oct 2024 12:56:23 +0100 Subject: [PATCH 1/2] Added recommended resolutions for the "easier to fix" situations TA Tool is designed to highlight problems. It would be easier for customers/TSEs if we could place examples of the commands to rectify known issues, or at least how they would start fixing situations. --- scripts.d/ta/290_check_traces_free_space.sh | 5 ++++- scripts.d/ta/390_data_folder.sh | 1 + scripts.d/ta/400_s3_using_etcd.sh | 8 +++++--- .../ta/420_check_cross-numa_zone_memory_balance.sh | 2 ++ scripts.d/ta/430_nvme_used_capacity_vs_maximum.sh | 7 +++++++ scripts.d/ta/440_hostnames_rfc952.sh | 1 + scripts.d/ta/450_custom_ca_certs.sh | 2 ++ scripts.d/ta/460_ip_source-based_routing.sh | 3 +++ scripts.d/ta/470_number_of_numa_domains.sh | 11 +++++++++++ scripts.d/ta/480_check_weka_agent.sh | 6 +++++- scripts.d/ta/490_ip_route_metrics.sh | 2 ++ scripts.d/ta/500_sysctl_rp_filter.sh | 4 +++- scripts.d/ta/510_check_for_noprefixroute.sh | 5 ++++- scripts.d/ta/520_bucket_and_process_uptime.sh | 10 +++++++++- scripts.d/ta/530_high_drive_read_ssd_ratio.sh | 3 +++ scripts.d/ta/550_iptables_nats_local_traffic.sh | 2 ++ scripts.d/ta/560_check_for_swap.sh | 1 + scripts.d/ta/570_does_weka_use_swap.sh | 4 ++++ scripts.d/ta/580_weka_version_available_everywhere.sh | 2 ++ scripts.d/ta/590_single_dns_entry.sh | 1 + scripts.d/ta/610_nfs_aliases_sbr.sh | 2 ++ scripts.d/ta/620_same_mtu_across_nics.sh | 4 ++++ scripts.d/ta/630_opt_weka_exists_but_not_mounted.sh | 5 +++++ scripts.d/ta/640_opt_weka_is_not_symlink.sh | 3 +++ scripts.d/ta/650_firewall_check_quick.sh | 5 ++++- scripts.d/ta/660_hugepages_check.sh | 6 +++++- scripts.d/ta/670_crowdstrike_check.sh | 4 ++++ scripts.d/ta/670_nm_ignore_carrier.sh | 5 ++++- scripts.d/ta/680_redundant_weka_overrides.sh | 2 ++ scripts.d/ta/690_auto_core_in_mcb.sh | 8 ++++++++ scripts.d/ta/700_wekapp351707.sh | 1 + 
scripts.d/ta/710_no_spaces_in_cluster_name.sh | 3 +++ scripts.d/ta/720_low_compute_ram_to_ssd.sh | 5 +++++ scripts.d/ta/730_large_drives.sh | 2 +- scripts.d/ta/740_ensure_cgroups_v1_with_protocols.sh | 3 +++ scripts.d/ta/740_mlx_settings.sh | 6 +++++- scripts.d/ta/755_wekapp424920_smbw_mask.sh | 6 +++++- scripts.d/ta/765_process_network_mode.sh | 3 ++- scripts.d/ta/775_dup_arp_check.sh | 1 + scripts.d/ta/785_asymmetric_mtu.sh | 4 ++++ scripts.d/ta/790_raft_agents.sh | 1 + scripts.d/ta/795_netmask_mismatch.sh | 5 +++++ scripts.d/ta/805_lacp_hash_check.sh | 5 ++++- scripts.d/ta/810_use_only_readcache_for_protocols.sh | 9 +++++++++ scripts.d/ta/815_no_spaces_in_fs_name.sh | 3 +++ 45 files changed, 165 insertions(+), 16 deletions(-) diff --git a/scripts.d/ta/290_check_traces_free_space.sh b/scripts.d/ta/290_check_traces_free_space.sh index 3f8b297..ed556d9 100755 --- a/scripts.d/ta/290_check_traces_free_space.sh +++ b/scripts.d/ta/290_check_traces_free_space.sh @@ -52,7 +52,10 @@ if (( ${WEKA_ENSURE_FREE} > ${TRACES_FS_SIZE})) ; then echo "Weka is currently set to ensure that ${WEKA_ENSURE_FREE} bytes are free" echo "on ${WEKA_TRACES_DIR}, but this filesystem is only ${TRACES_FS_SIZE} bytes" echo "in size. These conditions cannot co-exist, so the outcome is that no" - echo "traces will be stored" + echo "traces will be stored." + echo "Recommended options:" + echo " . Increase the size of ${WEKA_TRACES_DIR}" + echo " . 
Reduce the size of traces with \"weka debug traces retention set --server-ensure-free XXXX\"" RETURN_CODE=1 fi if [[ ${RETURN_CODE} -eq 0 ]]; then diff --git a/scripts.d/ta/390_data_folder.sh b/scripts.d/ta/390_data_folder.sh index bed2b92..e0a8039 100755 --- a/scripts.d/ta/390_data_folder.sh +++ b/scripts.d/ta/390_data_folder.sh @@ -29,6 +29,7 @@ if [ -d "/data" ] ; then else echo "to ${JIRA_REFERENCE}, SFDC ${KB_REFERENCE}" fi + echo "The recommended fix is to upgrade your version of Weka" RETURN_CODE=1 fi fi diff --git a/scripts.d/ta/400_s3_using_etcd.sh b/scripts.d/ta/400_s3_using_etcd.sh index 6365e61..fb38e85 100755 --- a/scripts.d/ta/400_s3_using_etcd.sh +++ b/scripts.d/ta/400_s3_using_etcd.sh @@ -40,12 +40,14 @@ if [ ${WEKA_S3_RUNNING} -ge 1 ] ; then if verlte ${MIN_VERSION} ${WEKA_VERSION} && verlte ${WEKA_VERSION} ${MAX_VERSION} ; then WEKA_ETCD_HOSTS=$(weka s3 cluster --json | python3 -c 'import sys, json; data = json.load(sys.stdin); print(len(data["etcd_cluster_hosts"]))') if [ ${WEKA_ETCD_HOSTS} -gt 0 ] ; then - echo "S3 cluster is running, and this version of Weka requires migration" + echo "S3 cluster is running, and this version of Weka requires a configuration change." if [[ ! 
-z "${WTA_REFERENCE}" ]]; then - echo "to ${JIRA_REFERENCE}, discussed in ${WTA_REFERENCE}, SFDC ${KB_REFERENCE}" + echo "Refer to ${JIRA_REFERENCE}, discussed in ${WTA_REFERENCE}, SFDC ${KB_REFERENCE}" else - echo "to ${JIRA_REFERENCE}, SFDC ${KB_REFERENCE}" + echo "Refer to ${JIRA_REFERENCE}, SFDC ${KB_REFERENCE}" fi + echo "If you require the S3 service, please contact Customer Success indicating" + echo " you need to move the S3 service from ETCD to KWAS, as indicated in KB 1181" RETURN_CODE=254 fi fi diff --git a/scripts.d/ta/420_check_cross-numa_zone_memory_balance.sh b/scripts.d/ta/420_check_cross-numa_zone_memory_balance.sh index 751199b..7f76ba3 100755 --- a/scripts.d/ta/420_check_cross-numa_zone_memory_balance.sh +++ b/scripts.d/ta/420_check_cross-numa_zone_memory_balance.sh @@ -37,6 +37,8 @@ if [[ ${RATIO_SEEN} -gt ${MAX_ALLOWED_RATIO} ]]; then echo "from starting due to lack of NUMA zone-local memory" echo "The ratio is ${RATIO_SEEN}% and the maximum allowed ratio is ${MAX_ALLOWED_RATIO}%" echo "The memory in the highest zone is ${MAX_MEMORY_SEEN_KB} and in the lowest zone is ${MIN_MEMORY_SEEN_KB}" + echo "One recommended resolution is to balance the memory between NUMA zones by physically" + echo "moving memory, or by adding more to the smaller NUMA zone" fi if [[ ${RETURN_CODE} -eq 0 ]]; then diff --git a/scripts.d/ta/430_nvme_used_capacity_vs_maximum.sh b/scripts.d/ta/430_nvme_used_capacity_vs_maximum.sh index db1e332..8f1eb0a 100755 --- a/scripts.d/ta/430_nvme_used_capacity_vs_maximum.sh +++ b/scripts.d/ta/430_nvme_used_capacity_vs_maximum.sh @@ -27,6 +27,13 @@ echo ${WEKA_SSD_USED_BYTES} # if we've allocated more than half the maximum theoretical SSD space, warn if [[ $((${WEKA_SSD_USED_BYTES}*2)) -gt ${WEKA_THEORETICAL_MAX_SSD_BYTES} ]] ; then + echo "You have used a significant proportion of the theoretical maximum" + echo "NVME capacity of the cluster which is decided at first install time." 
+ echo "Please contact customer success to discuss options. Possible actions include:" + echo " . Adding an Object Store to expand data storage while keeping NVME capacity down" + echo " . In-place cluster resizing and migration (perhaps via snap2obj for fast backup/restore)" + echo " . Migrating to a different, larger cluster" + echo " . Pruning unnecessary data" RETURN_CODE=254 fi if [[ ${RETURN_CODE} -eq 0 ]]; then diff --git a/scripts.d/ta/440_hostnames_rfc952.sh b/scripts.d/ta/440_hostnames_rfc952.sh index b778a31..452aaf3 100644 --- a/scripts.d/ta/440_hostnames_rfc952.sh +++ b/scripts.d/ta/440_hostnames_rfc952.sh @@ -23,6 +23,7 @@ GREP_RESULT=$(echo ${SHORT_HOSTNAME} | grep "[^-a-z0-9.]") if [[ $? -eq 0 ]]; then echo "The hostname ${SHORT_HOSTNAME} appears to contain a character other than [a-z], -, and [0-9]." echo "Refer to RFC 952 for more information" + echo "Recommended resolution: change the hostname to include only lowercase alphanumerics and hyphens" RETURN_CODE=254 fi diff --git a/scripts.d/ta/450_custom_ca_certs.sh b/scripts.d/ta/450_custom_ca_certs.sh index 546e68e..8ca8391 100755 --- a/scripts.d/ta/450_custom_ca_certs.sh +++ b/scripts.d/ta/450_custom_ca_certs.sh @@ -15,6 +15,8 @@ grep -q SSL_CERT_FILE /opt/weka/dist/release/${WEKA_VERSION}.spec 2>/dev/null if [[ $? -eq 0 ]] ; then echo "This version of weka appears to use custom CA certificates. 
Care will be needed for upgrading" + echo "Recommended resolution: remove custom CA specification, and upgrade to a more recent" + echo "version that natively supports additional CA bundles" RETURN_CODE=254 fi if [[ ${RETURN_CODE} -eq 0 ]]; then diff --git a/scripts.d/ta/460_ip_source-based_routing.sh b/scripts.d/ta/460_ip_source-based_routing.sh index 7b1519a..02b5a0a 100755 --- a/scripts.d/ta/460_ip_source-based_routing.sh +++ b/scripts.d/ta/460_ip_source-based_routing.sh @@ -78,6 +78,9 @@ if [[ ${SOURCE_BASED_ROUTING_RECOMMENDED} -ge "1" ]] ; then echo "Warning: Not every interface appears to have arp_filter=1 set. This could lead to communication problems" RETURN_CODE="254" fi + echo "Recommended resolution: Although networking is typically site- and hardware-dependent," + echo " some example configurations for the common dual NIC setup are noted on the WEKA" + echo " documentation site: https://docs.weka.io/planning-and-installation/bare-metal/setting-up-the-hosts#configure-the-ha-networking" fi if [[ ${RETURN_CODE} -eq 0 ]]; then diff --git a/scripts.d/ta/470_number_of_numa_domains.sh b/scripts.d/ta/470_number_of_numa_domains.sh index 7351808..15d1c67 100755 --- a/scripts.d/ta/470_number_of_numa_domains.sh +++ b/scripts.d/ta/470_number_of_numa_domains.sh @@ -36,6 +36,8 @@ echo -n "Detected $NUMBER_OF_NUMA_DOMAINS NUMA domains - " if [[ $NUMBER_OF_NUMA_DOMAINS -gt $MAXIMUM_NUMA_DOMAINS ]]; then RETURN_CODE=254 echo "Weka currenty only supports a maximum of 32 NUMA domains (4.2.11+)." 
+ echo " Recommended resolution: reduce the number of NUMA domains, perhaps by reducing" + echo " the NUMAs per socket setting in the machine's firmware" # 8 or fewer NUMAs is always supported elif [[ $NUMBER_OF_NUMA_DOMAINS -le 8 ]]; then @@ -45,16 +47,25 @@ elif [[ $NUMBER_OF_NUMA_DOMAINS -le 8 ]]; then elif vergte $WEKA_VERSION "4.3.0" && verlt $WEKA_VERSION "4.3.2" && [[ $NUMBER_OF_NUMA_DOMAINS -gt 16 ]]; then RETURN_CODE=254 echo "Weka only supports more than 16 NUMA domains in 4.3.2 and higher." + echo " Recommended resolutions: either" + echo " . Reduce the number of NUMA domains, perhaps by reducing the NUMAs per socket setting in the machine's firmware" + echo " . Upgrade Weka to a more recent version" # More than 16 NUMAs only supported in 4.2.11+ elif vergt $WEKA_VERSION "4.2.6" && verlt $WEKA_VERSION "4.2.11" && [[ $NUMBER_OF_NUMA_DOMAINS -gt 16 ]]; then RETURN_CODE=254 echo "Weka only supports more than 16 NUMA domains in (4.2.11+, 4.3.2+)." + echo " Recommended resolutions: either" + echo " . Reduce the number of NUMA domains, perhaps by reducing the NUMAs per socket setting in the machine's firmware" + echo " . Upgrade Weka to a more recent version" # More than 8 NUMAs only supported in 4.2.7+ elif verlt $WEKA_VERSION "4.2.7" && [[ $NUMBER_OF_NUMA_DOMAINS -gt 8 ]]; then RETURN_CODE=254 echo "Weka only supports more than 8 NUMA domains in 4.2.7 and higher." + echo " Recommended resolutions: either" + echo " . Reduce the number of NUMA domains, perhaps by reducing the NUMAs per socket setting in the machine's firmware" + echo " . Upgrade Weka to a more recent version" else echo "Number of NUMA domains is within supported limits." fi diff --git a/scripts.d/ta/480_check_weka_agent.sh b/scripts.d/ta/480_check_weka_agent.sh index b73b24f..2531b45 100755 --- a/scripts.d/ta/480_check_weka_agent.sh +++ b/scripts.d/ta/480_check_weka_agent.sh @@ -17,11 +17,15 @@ if [[ $? 
-ne "0" ]] ; then RETURN_CODE=254 echo "The service weka-agent is not reported as enabled by systemd" echo "This may cause weka to fail to start" + echo " Recommended Resolution: enable the service with systemctl enable weka-agent" - if [[ ! -L /etc/init.d ]]; then echo "/etc/init.d is expected to be a symlink to /etc/rc.d/init.d" echo "Without this systemd is unable to find and thus start the weka-agent sysV init script" + echo " Recommended Resolution: on RHEL-based OSes move any scripts to /etc/rc.d/init.d, remove" + echo " the /etc/init.d directory, and re-create it as a link. The following commands are" + echo " one way to achieve this" + echo " mv /etc/init.d/* /etc/rc.d/init.d/ && rmdir /etc/init.d && ln -s /etc/rc.d/init.d /etc/init.d" fi fi diff --git a/scripts.d/ta/490_ip_route_metrics.sh b/scripts.d/ta/490_ip_route_metrics.sh index de95023..0e01651 100755 --- a/scripts.d/ta/490_ip_route_metrics.sh +++ b/scripts.d/ta/490_ip_route_metrics.sh @@ -46,6 +46,8 @@ if [[ ${NUMBER_OF_OVERLAPPING_ROUTES_WITH_METRICS} -gt "1" ]]; then echo "that these entries will negatively affect the performance of e.g. floating IP" echo "addresses. In any case it is unlikely that preferential IP routes are of" echo "benefit in a high-performance local network" + echo "Recommended Resolution: review the output of \"ip route\" and rationalise the routes," + echo " likely by removing or coalescing the overlapping routes into larger ranges" fi if [[ ${RETURN_CODE} -eq 0 ]]; then diff --git a/scripts.d/ta/500_sysctl_rp_filter.sh b/scripts.d/ta/500_sysctl_rp_filter.sh index 1cdf643..6229554 100755 --- a/scripts.d/ta/500_sysctl_rp_filter.sh +++ b/scripts.d/ta/500_sysctl_rp_filter.sh @@ -35,16 +35,18 @@ if [[ $RP_FILTER_VALUE_ALL != "2" ]]; then echo "The value for net.ipv4.conf.${INTERFACE}.rp_filter is set to ${RP_FILTER_VALUE}." echo "This can disrupt floating IP addresses for protocols." echo "It is recommended to set net.ipv4.conf.${INTERFACE}.rp_filter to 2." 
+ echo "Recommended resolution: set this value in e.g. /etc/sysctl.d/99-weka-nics.conf" elif [[ $RP_FILTER_VALUE_ALL == "1" && $RP_FILTER_VALUE == "0" ]]; then RETURN_CODE="254" echo "The value for net.ipv4.conf.${INTERFACE}.rp_filter is set to ${RP_FILTER_VALUE}." echo "The value for net.ipv4.conf.all.rp_filter is set to ${RP_FILTER_VALUE_ALL} and takes precedence." echo "This can disrupt floating IP addresses for protocols." echo "It is recommended to set net.ipv4.conf.${INTERFACE}.rp_filter or net.ipv4.conf.all.rp_filter to 2." + echo "Recommended resolution: set this value in e.g. /etc/sysctl.d/99-weka-nics.conf" fi done else echo "net.ipv4.conf.all.rp_filter is set to 2, no further testing necessary." fi -exit ${RETURN_CODE} \ No newline at end of file +exit ${RETURN_CODE} diff --git a/scripts.d/ta/510_check_for_noprefixroute.sh b/scripts.d/ta/510_check_for_noprefixroute.sh index 25973ee..135b5fe 100755 --- a/scripts.d/ta/510_check_for_noprefixroute.sh +++ b/scripts.d/ta/510_check_for_noprefixroute.sh @@ -28,7 +28,10 @@ if [[ "${NOPREFIXROUTE_COUNT}" != "0" ]]; then echo "Certain IP addresses are configured with noprefixroute. 
This will inhibit the ability" echo "of certain cluster floating ips to accurately determine which link should be preferred" echo "The command \"ip -o -f inet route list match xxx.xxx.xxx.xxx/32 scope link\" needs to" - echo "Be able to return a device for each floating IP configured" + echo "be able to return a device for each floating IP configured" + echo "Recommended Resolution: remove the noprefixroute flag or otherwise ensure the" + echo " ip route list command given above can resolve the link on which you wish the" + echo " floating IP to be configured" fi if [[ ${RETURN_CODE} -eq 0 ]]; then diff --git a/scripts.d/ta/520_bucket_and_process_uptime.sh b/scripts.d/ta/520_bucket_and_process_uptime.sh index e44968f..80d3342 100755 --- a/scripts.d/ta/520_bucket_and_process_uptime.sh +++ b/scripts.d/ta/520_bucket_and_process_uptime.sh @@ -36,12 +36,20 @@ CURRENT_TIME_EPOCH=$( date +%s) if [[ $((${CURRENT_TIME_EPOCH}-${MOST_RECENT_BUCKET_STARTTIME_EPOCH})) -lt 3600 ]]; then RETURN_CODE="254" echo "Weka buckets have been restarted within the last hour, or have never started. This may not be a problem on a new cluster" - echo "but could be indicative of problems (e.g. network flapping" + echo "but could be indicative of problems (e.g. network flapping)" + echo "Recommended Resolutions:" + echo " . If this is a new cluster, or hosts have been upgraded/rebooted, this is likely expected" + echo " . Otherwise the most likely cause is network problems, such as link flapping or congestion." + echo " . Review hardware and network stability, then contact customer success" fi if [[ $((${CURRENT_TIME_EPOCH}-${MOST_RECENT_PROCESS_STARTTIME_EPOCH})) -lt 3600 ]]; then RETURN_CODE="254" echo "Weka processes have been restarted within the last hour, or have never started. This may not be a problem on a new cluster" echo "but could be indicative of problems (e.g. network flapping" + echo "Recommended Resolutions:" + echo " . 
If this is a new cluster, or hosts have been upgraded/rebooted, this is likely expected" + echo " . Otherwise the most likely cause is network problems, such as link flapping or congestion." + echo " . Review hardware and network stability, then contact customer success" fi if [[ ${RETURN_CODE} -eq 0 ]]; then diff --git a/scripts.d/ta/530_high_drive_read_ssd_ratio.sh b/scripts.d/ta/530_high_drive_read_ssd_ratio.sh index 1416142..a64899e 100755 --- a/scripts.d/ta/530_high_drive_read_ssd_ratio.sh +++ b/scripts.d/ta/530_high_drive_read_ssd_ratio.sh @@ -34,6 +34,9 @@ if [[ ${HIGHER_THAN_EXPECTED} == "YES" ]]; then echo "The ratio of NVMe read requests vs DRIVE node read operations is higher than expected over the last ${TIME_TO_EXAMINE}" echo "This could indicate a number of things, such as splitting of read requests or perhaps read amplification" echo "Review ${JIRA_REFERENCE} for details" + echo "Recommended Resolutions:" + echo " . This may be expected behaviour for your workload" + echo " . The data may be read using much larger blocksizes than those in which it was written, and matching those may help" fi if [[ ${RETURN_CODE} -eq 0 ]]; then diff --git a/scripts.d/ta/550_iptables_nats_local_traffic.sh b/scripts.d/ta/550_iptables_nats_local_traffic.sh index 4ca97ca..b228339 100755 --- a/scripts.d/ta/550_iptables_nats_local_traffic.sh +++ b/scripts.d/ta/550_iptables_nats_local_traffic.sh @@ -21,6 +21,7 @@ for IP_ADDRESS in $(hostname --all-ip-addresses) ; do if [[ $? -eq 0 ]] ; then echo "Warning: it is possible that traffic to or from local IP address ${IP_ADDRESS} will be subject to NAT" echo "This can cause intra-WEKA communication errors" + echo "Recommended Resolution: Do not NAT WEKA traffic" RETURN_CODE="254" fi done @@ -29,6 +30,7 @@ for IP_ROUTE in $(ip -4 --json route list | python3 -c 'import sys, json, colle if [[ $? 
-eq 0 ]] ; then echo "Warning: it is possible that traffic to or from subnet ${IP_ROUTE} will be subject to NAT" echo "This can cause intra-WEKA communication errors" + echo "Recommended Resolution: Do not NAT WEKA traffic" RETURN_CODE="254" fi done diff --git a/scripts.d/ta/560_check_for_swap.sh b/scripts.d/ta/560_check_for_swap.sh index e3de53d..123f4a4 100755 --- a/scripts.d/ta/560_check_for_swap.sh +++ b/scripts.d/ta/560_check_for_swap.sh @@ -14,6 +14,7 @@ SWAPTOTAL=$(grep SwapTotal /proc/meminfo | awk '{print $2}') if [[ ${SWAPTOTAL} -ne "0" ]] ; then echo "This host has swap configured - this is unlikely to be" echo "helpful in a large memory system" + echo "Recommended Resolution: if the host has enough RAM, disable swap with swapoff then disable swap at boot time (likely in /etc/fstab)" RETURN_CODE="254" fi diff --git a/scripts.d/ta/570_does_weka_use_swap.sh b/scripts.d/ta/570_does_weka_use_swap.sh index 5543182..2d83ab5 100755 --- a/scripts.d/ta/570_does_weka_use_swap.sh +++ b/scripts.d/ta/570_does_weka_use_swap.sh @@ -17,6 +17,10 @@ for WEKAPID in $(ps -eo pid,comm | grep weka_init | awk '{print $1}') ; do if [[ ${NUM_PROCS_USING_SWAP} -gt "0" ]] ; then echo "There are Weka processes using swap - this is likely to be" echo "detrimental to performance" + echo "Recommended Resolutions:" + echo " . Add more RAM if the host is truly constrained" + echo " . Review if the host has not correctly released RAM" + echo " . 
Reduce the amount of RAM allocated to WEKA (a last resort)" RETURN_CODE="254" fi done diff --git a/scripts.d/ta/580_weka_version_available_everywhere.sh b/scripts.d/ta/580_weka_version_available_everywhere.sh index aca379f..fc6313d 100755 --- a/scripts.d/ta/580_weka_version_available_everywhere.sh +++ b/scripts.d/ta/580_weka_version_available_everywhere.sh @@ -27,6 +27,8 @@ CURRENT_AGENT_VERSION=$(weka local status | awk 'NR==1{print $5}' | tr -d ')') if [[ ${WEKA_CLUSTER_VERSION} != ${CURRENT_AGENT_VERSION} ]] ; then echo "The currently running cluster version ${WEKA_CLUSTER_VERSION} does not match the" echo "default installed local agent version ${CURRENT_AGENT_VERSION}" + echo "Recommended Resolution: update this host to the cluster version, either by" + echo " unmounting and re-mounting filesystems or using the weka local upgrade utility" RETURN_CODE="254" fi diff --git a/scripts.d/ta/590_single_dns_entry.sh b/scripts.d/ta/590_single_dns_entry.sh index ecb4c61..a0d6303 100755 --- a/scripts.d/ta/590_single_dns_entry.sh +++ b/scripts.d/ta/590_single_dns_entry.sh @@ -43,6 +43,7 @@ fi if [[ ${NUMBER_OF_A_RECORDS} != "1" ]] ; then echo "There are ${NUMBER_OF_A_RECORDS} A records in DNS for ${HOSTNAME}" echo "This is very likely to cause problems with (at least) SMB-W clustering" + echo "Recommended Resolution: ensure DNS has exactly one record of type A for ${HOSTNAME}, pointing to its IPv4 address" RETURN_CODE=254 else echo "There is exactly one A record in DNS for ${HOSTNAME}" diff --git a/scripts.d/ta/610_nfs_aliases_sbr.sh b/scripts.d/ta/610_nfs_aliases_sbr.sh index a834366..6a36076 100644 --- a/scripts.d/ta/610_nfs_aliases_sbr.sh +++ b/scripts.d/ta/610_nfs_aliases_sbr.sh @@ -71,6 +71,8 @@ main() { done < <(ip -4 rule | awk '{print $3}' | grep -v "all") if [[ $found_rule -eq 0 ]]; then echo "WARNING: No ip rule for address $NFS_IP! It is possible source-based routing should be configured." + echo "Recommended Resolution: configure source-based routing. 
Examples are mentioned in the WEKA docs:" + echo "https://docs.weka.io/planning-and-installation/bare-metal/setting-up-the-hosts#configure-the-ha-networking" RETURN_CODE=254 fi done < <(weka nfs interface-group assignment --no-header | awk '$3 == '$weka_host_id'' | awk '{print $1}') diff --git a/scripts.d/ta/620_same_mtu_across_nics.sh b/scripts.d/ta/620_same_mtu_across_nics.sh index e26a891..d08c1db 100644 --- a/scripts.d/ta/620_same_mtu_across_nics.sh +++ b/scripts.d/ta/620_same_mtu_across_nics.sh @@ -33,6 +33,10 @@ for CONTAINER in $(weka local ps --no-header | awk '{print $1}' | grep -vw -e en echo "has an MTU of ${MTU}, which is less than the MTU ${SMALLEST_MTU_REQUIRED} seen elsewhere in this host" echo "This can lead to cluster communication problems" echo "Please see ${JIRA_REFERENCE} for more information" + echo "Recommended Resolution: Increase the MTUs of all NICs in the cluster to at least ${SMALLEST_MTU_REQUIRED}" + echo "Review your OS documentation for how to set this permanently, but NetworkManager-based OSes will use" + echo "something like \"nmcli connection modify eno1 802-3-ethernet.mtu ${SMALLEST_MTU_REQUIRED}\" and then" + echo "\"nmcli connection apply eno1\", but connection names will vary" RETURN_CODE=254 fi done diff --git a/scripts.d/ta/630_opt_weka_exists_but_not_mounted.sh b/scripts.d/ta/630_opt_weka_exists_but_not_mounted.sh index f00e76c..f05ba7c 100644 --- a/scripts.d/ta/630_opt_weka_exists_but_not_mounted.sh +++ b/scripts.d/ta/630_opt_weka_exists_but_not_mounted.sh @@ -29,6 +29,11 @@ main() { echo echo "This means that changes made to the live system as it is now" echo "are unlikely to be present on the system post-reboot" + echo "Recommended Resolution:" + echo " . Do NOT reboot the host" + echo " . You need to verify that the on-boot configuration for /opt/weka" + echo " matches and uses the currently used layout. 
This may involve" + echo " editing filesystem layouts in /etc/fstab or systemd" RETURN_CODE=254 else echo "No immediate directory/mount overlaps found" diff --git a/scripts.d/ta/640_opt_weka_is_not_symlink.sh b/scripts.d/ta/640_opt_weka_is_not_symlink.sh index 5605ce3..9f50af6 100644 --- a/scripts.d/ta/640_opt_weka_is_not_symlink.sh +++ b/scripts.d/ta/640_opt_weka_is_not_symlink.sh @@ -13,6 +13,9 @@ main() { if [[ -L /opt/weka ]] ; then echo "/opt/weka is a symlink. This is not supported and" echo "is very unlikely to work due to chroot-style container behaviour" + echo "Recommended Resolution: Do not install Weka in a symlink" + echo "Resolving this can involve a rolling deactivation and re-installation" + echo "of Weka, depending on how and why this was done" RETURN_CODE=254 else echo "/opt/weka is not a symlink. This is ok" diff --git a/scripts.d/ta/650_firewall_check_quick.sh b/scripts.d/ta/650_firewall_check_quick.sh index 0cbd15a..e8dd00b 100644 --- a/scripts.d/ta/650_firewall_check_quick.sh +++ b/scripts.d/ta/650_firewall_check_quick.sh @@ -54,6 +54,9 @@ for ip in ${!BACKEND_IPS[@]}; do for port in ${ports[@]}; do if (! echo -n 2>/dev/null < /dev/tcp/$ip/$port); then echo "WARN: Unable to connect to $ip tcp/$port" + echo "Recommended Resolution: There is likely something blocking network communication between" + echo "this host and ${ip} tcp/${port}. Please review network connectivity and/or firewalls" + echo "In particular DDOS-style protection on switches may prevent communication" RETURN_CODE=254 fi done @@ -67,4 +70,4 @@ if [[ ${RETURN_CODE} -eq 0 ]]; then echo "No backend management ports blocked." 
fi -exit ${RETURN_CODE} \ No newline at end of file +exit ${RETURN_CODE} diff --git a/scripts.d/ta/660_hugepages_check.sh b/scripts.d/ta/660_hugepages_check.sh index 1e49e48..96443d6 100644 --- a/scripts.d/ta/660_hugepages_check.sh +++ b/scripts.d/ta/660_hugepages_check.sh @@ -28,6 +28,8 @@ if [[ -n $WEKA_HUGE_1G ]]; then if [[ $DIFF_1G != 0 ]]; then RETURN_CODE=254 echo "Discrepancy of $DIFF_1G 1GiB hugepage(s) between Weka and OS." + echo "Recommended Resolution: Review if other applications (such as hypervisors) are" + echo "using hugepages. If they are, this may be expected." fi fi @@ -37,6 +39,8 @@ if [[ -n $WEKA_HUGE_2M ]]; then if [[ $DIFF_2M != 0 ]]; then RETURN_CODE=254 echo "Discrepancy of $DIFF_2M 2MiB hugepage(s) between Weka and OS." + echo "Recommended Resolution: Review if other applications (such as hypervisors) are" + echo "using hugepages. If they are, this may be expected." fi fi @@ -45,4 +49,4 @@ if [[ $RETURN_CODE -eq 0 ]]; then echo "No hugepages allocation discrepancy." 
fi -exit ${RETURN_CODE} \ No newline at end of file +exit ${RETURN_CODE} diff --git a/scripts.d/ta/670_crowdstrike_check.sh b/scripts.d/ta/670_crowdstrike_check.sh index dcab6ab..31497f1 100644 --- a/scripts.d/ta/670_crowdstrike_check.sh +++ b/scripts.d/ta/670_crowdstrike_check.sh @@ -14,9 +14,13 @@ RETURN_CODE=0 if systemctl status falcon-sensor &> /dev/null; then echo "Warning: CrowdStrike Falcon Sensor is running" + echo "Recommended Resolution: we do not recommend using this software in conjunction with WEKA as" + echo "it has been shown to cause problems unloading kernel modules" exit 254 elif lsmod | grep -q -m 1 falcon_lsm; then echo "Warning: Crowdstrike Falcon kernel module loaded" + echo "Recommended Resolution: we do not recommend using this software in conjunction with WEKA as" + echo "it has been shown to cause problems unloading kernel modules" exit 254 fi echo "CrowdStrike Falcon Sensor is not running" diff --git a/scripts.d/ta/670_nm_ignore_carrier.sh b/scripts.d/ta/670_nm_ignore_carrier.sh index 8431e97..ca831f2 100644 --- a/scripts.d/ta/670_nm_ignore_carrier.sh +++ b/scripts.d/ta/670_nm_ignore_carrier.sh @@ -20,6 +20,9 @@ if nmcli -v &> /dev/null; then elif [[ "$IGNORE_CARRIER" != "*" ]]; then RETURN_CODE=254 echo "NetworkManager ignore-carrier is set to ${IGNORE_CARRIER}, but recommended value is ignore-carrier=*" + echo "Recommended Resolution: set ignore-carrier=* in NetworkManager, perhaps with the following commands" + echo " echo -e '[main]\\nignore-carrier=*' > /etc/NetworkManager/conf.d/99-carrier.conf " + echo " systemctl restart NetworkManager " else echo "NetworkManager ignore-carrier=* exists." fi @@ -30,4 +33,4 @@ else echo "NetworkManager not in use." 
fi -exit ${RETURN_CODE} \ No newline at end of file +exit ${RETURN_CODE} diff --git a/scripts.d/ta/680_redundant_weka_overrides.sh b/scripts.d/ta/680_redundant_weka_overrides.sh index 942303f..3c31865 100644 --- a/scripts.d/ta/680_redundant_weka_overrides.sh +++ b/scripts.d/ta/680_redundant_weka_overrides.sh @@ -51,6 +51,8 @@ while read CURRENT_OVERRIDE; do REDUNDANT_FROM_VERSION=${REDUNDANT_OVERRIDE_LIST[${CURRENT_OVERRIDE}]} if verlte ${REDUNDANT_FROM_VERSION} ${CURRENT_WEKA_VERSION} ; then echo "Override ${CURRENT_OVERRIDE} is no longer necessary as of v${REDUNDANT_FROM_VERSION}" + echo "Recommended Resolution: Contact customer success and query if this override can" + echo "be disabled and subsequently removed" RETURN_CODE=254 fi done < <(weka debug override list --output key --no-header) diff --git a/scripts.d/ta/690_auto_core_in_mcb.sh b/scripts.d/ta/690_auto_core_in_mcb.sh index 049ed8f..3636f17 100644 --- a/scripts.d/ta/690_auto_core_in_mcb.sh +++ b/scripts.d/ta/690_auto_core_in_mcb.sh @@ -14,6 +14,14 @@ for WEKA_CONTAINER in $(weka local ps --output name --no-header | grep -E '(driv MATCHES=$(weka local resources -C ${WEKA_CONTAINER} | grep -cE '^(DRIVES|COMPUTE|FRONTEND) *[0-9].*auto') if [[ ${MATCHES} -ne 0 ]] ; then echo "Host ${HOSTNAME} has auto-core allocation in MCB container ${WEKA_CONTAINER}" + echo "Recommended Resolution: reconfigure the local resources to use a fixed CPU core, such as" + if [[ ${WEKA_CONTAINER} =~ "drive" ]] ; then + echo "weka local resources cores --container ${WEKA_CONTAINER} --only-drives-cores --core-ids X,Y,Z" + elif [[ ${WEKA_CONTAINER} =~ "compute" ]] ; then + echo "weka local resources cores --container ${WEKA_CONTAINER} --only-compute-cores --core-ids X,Y,Z" + elif [[ ${WEKA_CONTAINER} =~ "frontend" ]] ; then + echo "weka local resources cores --container ${WEKA_CONTAINER} --only-frontend-cores --core-ids X,Y,Z" + fi exit 254 fi done diff --git a/scripts.d/ta/700_wekapp351707.sh b/scripts.d/ta/700_wekapp351707.sh 
index fb656cf..7cc8604 100644 --- a/scripts.d/ta/700_wekapp351707.sh +++ b/scripts.d/ta/700_wekapp351707.sh @@ -39,6 +39,7 @@ if [[ $WEKA_VERSION = "4.2.7.64" || $WEKA_VERSION = "4.2.8.66" ]]; then echo "SSD metadata exceeds more than half of available SSD space on one or more filesystems." echo "Possibly vulnerable to WEKAPP-351707." echo "Consider adding the fs_backpressure_skip_ssdwritecache_estimation_all override." + echo "Recommended resolution: upgrade to a version beyond 4.2.9.x" fi fi done < <(weka fs -R --no-header -o availableSSD,usedSSDM,stores | sed -e 's/B//g' | awk '{print $1, $2, $3}') diff --git a/scripts.d/ta/710_no_spaces_in_cluster_name.sh b/scripts.d/ta/710_no_spaces_in_cluster_name.sh index 349e013..2a209f8 100644 --- a/scripts.d/ta/710_no_spaces_in_cluster_name.sh +++ b/scripts.d/ta/710_no_spaces_in_cluster_name.sh @@ -27,6 +27,9 @@ WEKA_CLUSTER_NAME=$(weka status | grep cluster: | sed -e 's/^ *cluster: *//' -e if [[ ${WEKA_CLUSTER_NAME} = *" "* ]]; then echo "Weka cluster name contains spaces" echo "This will prevent an S3 cluster from starting - see KB ${KB_REFERENCE}" + NEW_RECOMMENDED_NAME=$(echo ${WEKA_CLUSTER_NAME} | sed 's/ /_/g') + echo "Recommended resolution: update the cluster name, e.g. using:" + echo " weka cluster update --cluster-name ${NEW_RECOMMENDED_NAME}" RETURN_CODE=254 else echo "Weka cluster name does not contain spaces" diff --git a/scripts.d/ta/720_low_compute_ram_to_ssd.sh b/scripts.d/ta/720_low_compute_ram_to_ssd.sh index f14a276..4ffa80f 100644 --- a/scripts.d/ta/720_low_compute_ram_to_ssd.sh +++ b/scripts.d/ta/720_low_compute_ram_to_ssd.sh @@ -30,6 +30,11 @@ RAM_TO_SSD_RATIO=$(echo "${WEKA_SSD_CAPACITY}/${WEKA_COMPUTE_RAM}" | bc) if [[ ${RAM_TO_SSD_RATIO} -gt 4000 ]]; then echo "Warning: there is more than 4000 times the RAM capacity in total NVME capacity" echo "This may lead to Weka bucket startup issues. 
Refer to ${JIRA_REFERENCE}" + echo "Recommended Resolution: add more memory to cluster - options include:" + echo " . Increasing the amount of memory allocated to COMPUTE processes if there's spare" + echo " . Increasing the amount of RAM installed, then doing the above" + echo " . Scaling out by adding more hosts" + echo " . Reducing the size of the NVME by removing drives or tiering to Object Store" RETURN_CODE=254 else echo "RAM to SSD ratio is acceptable" diff --git a/scripts.d/ta/730_large_drives.sh b/scripts.d/ta/730_large_drives.sh index e054d73..ef5f4e5 100755 --- a/scripts.d/ta/730_large_drives.sh +++ b/scripts.d/ta/730_large_drives.sh @@ -43,7 +43,7 @@ if verlt ${WEKA_VERSION} "4.1.2" && [[ ${LARGEST_SSD} -gt ${LARGEST_SUPPORTED_SS RETURN_CODE=254 echo "Weka only supports SSDs larger than ${LARGEST_SUPPORTED_SSD} in versions after 4.1.2" echo "Refer to ${KB_REFERENCE} or ${JIRA_REFERENCE} for more information" - + echo "Recommended Resolution: upgrade to our latest LTS release" else echo "No SSDs are beyond supported capacities" fi diff --git a/scripts.d/ta/740_ensure_cgroups_v1_with_protocols.sh b/scripts.d/ta/740_ensure_cgroups_v1_with_protocols.sh index 3e43a31..8182fa3 100644 --- a/scripts.d/ta/740_ensure_cgroups_v1_with_protocols.sh +++ b/scripts.d/ta/740_ensure_cgroups_v1_with_protocols.sh @@ -31,6 +31,9 @@ else for CONTAINER in $(weka local ps --no-header | awk '{print $1}' | grep -w -e ganesha -e smbw -e s3) ; do RETURN_CODE=254 echo "Protocol container ${CONTAINER} is not yet compatible with cgroup mode ${CURRENT_CGROUP_MODE}" + echo "Recommended Resolution: reboot the host with cgroup v1 enabled, likely by adding" + echo "\"systemd.unified_cgroup_hierarchy=false\" to e.g. 
/etc/default/grub's DEFAULT line and" + echo "running \"update-grub\" (OS-dependent)" done fi diff --git a/scripts.d/ta/740_mlx_settings.sh b/scripts.d/ta/740_mlx_settings.sh index 36c9706..d9c416d 100644 --- a/scripts.d/ta/740_mlx_settings.sh +++ b/scripts.d/ta/740_mlx_settings.sh @@ -65,6 +65,10 @@ fi if [[ $RETURN_CODE -eq 0 ]]; then echo "Mellanox NIC settings correctly set." +else + echo "Mellanox NIC settings are not as recommended. Recommended Resolution:" + echo 'for dev in $(ls /sys/class/infiniband/); do sudo mlxconfig -y -d ${dev} set ADVANCED_PCI_SETTINGS=1 PCI_WR_ORDERING=1 ; done' + echo "Followed by rebooting this host, one at a time" fi -exit $RETURN_CODE \ No newline at end of file +exit $RETURN_CODE diff --git a/scripts.d/ta/755_wekapp424920_smbw_mask.sh b/scripts.d/ta/755_wekapp424920_smbw_mask.sh index 2d3ef71..c819f75 100644 --- a/scripts.d/ta/755_wekapp424920_smbw_mask.sh +++ b/scripts.d/ta/755_wekapp424920_smbw_mask.sh @@ -34,11 +34,15 @@ if weka smb cluster | awk '/Type:/ && /smbw/' &> /dev/null; then if [[ $NUM_SHARES -ne $NUM_FILE_MASKS ]]; then echo "WARN: there are $NUM_SHARES smbw shares, but only $NUM_FILE_MASKS shares with force_create_mode" + echo "Recommended Resolution: for each share, delete and re-create it to ensure this mode is set." + echo " NB: this will likely be service-affecting" RETURN_CODE=254 fi if [[ $NUM_SHARES -ne $NUM_DIR_MASKS ]]; then echo "WARN: there are $NUM_SHARES smbw shares, but only $NUM_DIR_MASKS shares with force_directory_mode" + echo "Recommended Resolution: for each share, delete and re-create it to ensure this mode is set." 
+ echo " NB: this will likely be service-affecting" RETURN_CODE=254 fi else @@ -55,4 +59,4 @@ if [[ $RETURN_CODE -eq 0 ]]; then echo "Not vulnerable to WEKAPP-424920 - smbw shares properly defined" fi -exit $RETURN_CODE \ No newline at end of file +exit $RETURN_CODE diff --git a/scripts.d/ta/765_process_network_mode.sh b/scripts.d/ta/765_process_network_mode.sh index f269a65..43d5d54 100644 --- a/scripts.d/ta/765_process_network_mode.sh +++ b/scripts.d/ta/765_process_network_mode.sh @@ -30,6 +30,7 @@ for ROLE in COMPUTE DRIVES; do if [[ $(weka cluster process -F role=${ROLE} -o netmode --no-header | sort | uniq | wc -l) -gt 1 ]]; then RETURN_CODE=254 echo "WARNING: $ROLE process network modes are inconsistent" + echo "Recommended Resolution: contact Customer Success to ensure that each container is defined correctly" fi done @@ -38,4 +39,4 @@ if [[ $RETURN_CODE -eq 0 ]]; then echo "Backend process network modes are consistent." fi -exit $RETURN_CODE \ No newline at end of file +exit $RETURN_CODE diff --git a/scripts.d/ta/775_dup_arp_check.sh b/scripts.d/ta/775_dup_arp_check.sh index 71d0a0f..4bf2763 100644 --- a/scripts.d/ta/775_dup_arp_check.sh +++ b/scripts.d/ta/775_dup_arp_check.sh @@ -31,6 +31,7 @@ fi for MGMT_IP in $(weka cluster container net -o ips --no-header | tr ',' '\n' | tr -d " " | sort -u); do if [[ $(ip -br neigh | grep ${MGMT_IP} | awk '{print $3}' | sort -u | wc -l) -gt 1 ]]; then echo "WARN: Duplicate arp entry found for IP ${MGMT_IP}" + echo "Recommended Resolution: check for IP clashes, and that there is a 1:1 mapping for IP:MACs" RETURN_CODE=254 fi done diff --git a/scripts.d/ta/785_asymmetric_mtu.sh b/scripts.d/ta/785_asymmetric_mtu.sh index b783a1e..098ee4c 100644 --- a/scripts.d/ta/785_asymmetric_mtu.sh +++ b/scripts.d/ta/785_asymmetric_mtu.sh @@ -15,6 +15,10 @@ for INDIVIDUAL_DRIVE_PROCESS in $(weka cluster process --backends --filter role= if [[ $(weka debug net peers --no-header ${INDIVIDUAL_DRIVE_PROCESS} --output inMTU,outMTU | awk 
'{if($1 != $2) {print "yes"}}') == "yes" ]]; then host=$(weka cluster process ${INDIVIDUAL_DRIVE_PROCESS} --no-header -o hostname) echo "WARN: Asymmetric MTU detected for at least one peer of ${host}, process id ${INDIVIDUAL_DRIVE_PROCESS}" + echo "Recommended Resolution: The usual cause for this is assymetric routing, with different MTUs configured" + echo "along the two different paths. Run a tracepath/traceroute from each end of the backend<->client" + echo "connection, and determine if routes take different paths. It's likely that different will have" + echo "different pMTUs, and every intervening link on the path with the smaller MTU should be checked" RETURN_CODE=254 fi done diff --git a/scripts.d/ta/790_raft_agents.sh b/scripts.d/ta/790_raft_agents.sh index 2a1b5dd..fce15ee 100755 --- a/scripts.d/ta/790_raft_agents.sh +++ b/scripts.d/ta/790_raft_agents.sh @@ -30,6 +30,7 @@ WEKA_MAX_RAFT_AGENTS=$((${WEKA_COMPUTE_PROCESS_COUNT}*180)) if [[ ${WEKA_RAFT_AGENTS} -gt ${WEKA_MAX_RAFT_AGENTS} ]] ; then echo "The maximum number of raft agents recommended per compute node is 180. This cluster requires ${WEKA_RAFT_AGENTS} in total" + echo "Recommended resolution: scale out your cluster by adding more compute processes or perhaps backend WEKA servers" RETURN_CODE=254 fi if [[ ${RETURN_CODE} -eq 0 ]]; then diff --git a/scripts.d/ta/795_netmask_mismatch.sh b/scripts.d/ta/795_netmask_mismatch.sh index ebd40ba..81eea88 100644 --- a/scripts.d/ta/795_netmask_mismatch.sh +++ b/scripts.d/ta/795_netmask_mismatch.sh @@ -32,6 +32,11 @@ done if [[ ${RETURN_CODE} -eq 0 ]]; then echo "All Weka containers have consistent netmasks" +else + echo "Recommended Resolution: determine which of these netmasks is correct, and rectify the one with" + echo "the wrong configuration. 
If Weka needs re-configuring, this will be done with commands like" + echo " weka local resources --container net remove " + echo " weka local resources --container net add --ips --netmask " fi exit ${RETURN_CODE} diff --git a/scripts.d/ta/805_lacp_hash_check.sh b/scripts.d/ta/805_lacp_hash_check.sh index 4c196a5..3f83252 100644 --- a/scripts.d/ta/805_lacp_hash_check.sh +++ b/scripts.d/ta/805_lacp_hash_check.sh @@ -98,6 +98,9 @@ fi if [[ ${RETURN_CODE} -eq 0 ]]; then echo "Bonding properly configured." +else + echo "Recommended Resolution: Determine NIC compatibility with the bonding mode selected:" + echo "https://docs.weka.io/planning-and-installation/prerequisites-and-compatibility#networking-ethernet" fi -exit ${RETURN_CODE} \ No newline at end of file +exit ${RETURN_CODE} diff --git a/scripts.d/ta/810_use_only_readcache_for_protocols.sh b/scripts.d/ta/810_use_only_readcache_for_protocols.sh index edbc9a0..577ee8e 100644 --- a/scripts.d/ta/810_use_only_readcache_for_protocols.sh +++ b/scripts.d/ta/810_use_only_readcache_for_protocols.sh @@ -14,6 +14,15 @@ for WEKA_CONTAINER in $(sudo weka local ps --output name --no-header | grep -w - if [[ ${MOUNTS_USING_WRITECACHE} != "0" ]]; then echo "WARN: container ${WEKA_CONTAINER} - used for protocols - is using writecache on host ${HOSTNAME}" echo "Refer to ${JIRA_REFERENCE} for more details" + if [[ ${WEKA_CONTAINER} =~ "s3" ]]; then + echo "Recommended Resolution: for s3, use the following (brief service interruption):" + echo " weka s3 cluster update --mount-options readcache -f" + elif [[ ${WEKA_CONTAINER} =~ "smb" ]]; then + echo "Recommended Resolution: for smb, for each share, delete it and re-add it (service interruption)" + elif [[ ${WEKA_CONTAINER} =~ "ganesha" ]]; then + echo "Recommended Resolution: for NFS, for each share, delete it and re-add it (service interruption)" + fi + RETURN_CODE=254 fi done diff --git a/scripts.d/ta/815_no_spaces_in_fs_name.sh b/scripts.d/ta/815_no_spaces_in_fs_name.sh index 
babefc7..a95e7f7 100644 --- a/scripts.d/ta/815_no_spaces_in_fs_name.sh +++ b/scripts.d/ta/815_no_spaces_in_fs_name.sh @@ -26,6 +26,9 @@ while read -r WEKA_FS_NAME ; do if [[ ${WEKA_FS_NAME} = *" "* ]]; then echo "Filesystem \"${WEKA_FS_NAME}\" contains spaces" echo "This can prevent S3 buckets from being created" + NEW_RECOMMENDED_NAME=$(echo ${WEKA_FS_NAME} | sed 's/ /_/g') + echo "Recommended resolution: update the filesystem name, e.g. using:" + echo " weka fs update \"${WEKA_FS_NAME}\" --new-name ${NEW_RECOMMENDED_NAME}" RETURN_CODE=254 fi done < <(weka fs --no-header --output name) From 69d6b1f87f9d648298708302075cdcc4f791d630 Mon Sep 17 00:00:00 2001 From: Jack Challen Date: Fri, 25 Oct 2024 14:26:55 +0100 Subject: [PATCH 2/2] Fix spelling etc, make spelling American standard --- scripts.d/ta/430_nvme_used_capacity_vs_maximum.sh | 2 +- scripts.d/ta/440_hostnames_rfc952.sh | 2 +- scripts.d/ta/490_ip_route_metrics.sh | 2 +- scripts.d/ta/510_check_for_noprefixroute.sh | 2 +- scripts.d/ta/520_bucket_and_process_uptime.sh | 4 ++-- scripts.d/ta/530_high_drive_read_ssd_ratio.sh | 2 +- scripts.d/ta/755_wekapp424920_smbw_mask.sh | 4 ++-- scripts.d/ta/785_asymmetric_mtu.sh | 2 +- scripts.d/ta/795_netmask_mismatch.sh | 2 +- 9 files changed, 11 insertions(+), 11 deletions(-) diff --git a/scripts.d/ta/430_nvme_used_capacity_vs_maximum.sh b/scripts.d/ta/430_nvme_used_capacity_vs_maximum.sh index 8f1eb0a..9978f9b 100755 --- a/scripts.d/ta/430_nvme_used_capacity_vs_maximum.sh +++ b/scripts.d/ta/430_nvme_used_capacity_vs_maximum.sh @@ -27,7 +27,7 @@ echo ${WEKA_SSD_USED_BYTES} # if we've allocated more than half the maximum theoretical SSD space, warn if [[ $((${WEKA_SSD_USED_BYTES}*2)) -gt ${WEKA_THEORETICAL_MAX_SSD_BYTES} ]] ; then - echo "You have used a significant proporation of the theoretical maximum" + echo "You have used a significant proportion of the theoretical maximum" echo "NVME capacity of the cluster which is decided at first install time."
echo "Please contact customer success to discuss options. Possible actions include:" echo " . Adding an Object Store to expand data storage while keeping NVME capacity down" diff --git a/scripts.d/ta/440_hostnames_rfc952.sh b/scripts.d/ta/440_hostnames_rfc952.sh index 452aaf3..7ebaa80 100644 --- a/scripts.d/ta/440_hostnames_rfc952.sh +++ b/scripts.d/ta/440_hostnames_rfc952.sh @@ -23,7 +23,7 @@ GREP_RESULT=$(echo ${SHORT_HOSTNAME} | grep "[^-a-z0-9.]") if [[ $? -eq 0 ]]; then echo "The hostname ${SHORT_HOSTNAME} appears to contain a character other than [a-z], -, and [0-9]." echo "Refer to RFC 952 for more information" - echo "Recommended resolution: change the hostname to include only alphanumerics and underscore" + echo "Recommended resolution: change the hostname to include only alphanumerics and hyphens" RETURN_CODE=254 fi diff --git a/scripts.d/ta/490_ip_route_metrics.sh b/scripts.d/ta/490_ip_route_metrics.sh index 0e01651..b1e6ebe 100755 --- a/scripts.d/ta/490_ip_route_metrics.sh +++ b/scripts.d/ta/490_ip_route_metrics.sh @@ -46,7 +46,7 @@ if [[ ${NUMBER_OF_OVERLAPPING_ROUTES_WITH_METRICS} -gt "1" ]]; then echo "that these entries will negatively affect the performance of e.g. floating IP" echo "addresses.
In any case it is unlikely that preferential IP routes are of" echo "benefit in a high-performance local network" - echo "Recommended Resolution: review the output of \"ip route\" and rationalise the routes," + echo "Recommended Resolution: review the output of \"ip route\" and rationalize the routes," echo " likely by removing or coalescing the overlapping routes into larger ranges" fi diff --git a/scripts.d/ta/510_check_for_noprefixroute.sh b/scripts.d/ta/510_check_for_noprefixroute.sh index 135b5fe..eae89a8 100755 --- a/scripts.d/ta/510_check_for_noprefixroute.sh +++ b/scripts.d/ta/510_check_for_noprefixroute.sh @@ -29,7 +29,7 @@ if [[ "${NOPREFIXROUTE_COUNT}" != "0" ]]; then echo "of certain cluster floating ips to accurately determine which link should be preferred" echo "The command \"ip -o -f inet route list match xxx.xxx.xxx.xxx/32 scope link\" needs to" echo "be able to return a device for each floating IP configured" - echo "Recommended Resolution: remove the noprefixroute flag or otherwise ensure the IP" + echo "Recommended Resolution: remove the noprefixroute flag or otherwise ensure the" echo " ip route list command given above can resolve the link on which you wish the" echo " floating IP to be configured" fi diff --git a/scripts.d/ta/520_bucket_and_process_uptime.sh b/scripts.d/ta/520_bucket_and_process_uptime.sh index 80d3342..71d72ae 100755 --- a/scripts.d/ta/520_bucket_and_process_uptime.sh +++ b/scripts.d/ta/520_bucket_and_process_uptime.sh @@ -38,7 +38,7 @@ if [[ $((${CURRENT_TIME_EPOCH}-${MOST_RECENT_BUCKET_STARTTIME_EPOCH})) -lt 3600 echo "Weka buckets have been restarted within the last hour, or have never started. This may not be a problem on a new cluster" echo "but could be indicative of problems (e.g. network flapping)" echo "Recommended Resolutions:" - echo " . If this is a new cluster, or hosts have been upgraded/reboot, this is likely expected" + echo " . 
If this is a new cluster, or hosts have been upgraded/rebooted, this is likely expected" echo " . Otherwise the most likely cause is network problems, such as link flapping or congestion." echo " . Review hardware and network stability, then contact customer success" fi @@ -47,7 +47,7 @@ if [[ $((${CURRENT_TIME_EPOCH}-${MOST_RECENT_PROCESS_STARTTIME_EPOCH})) -lt 3600 echo "Weka processes have been restarted within the last hour, or have never started. This may not be a problem on a new cluster" echo "but could be indicative of problems (e.g. network flapping" echo "Recommended Resolutions:" - echo " . If this is a new cluster, or hosts have been upgraded/reboot, this is likely expected" + echo " . If this is a new cluster, or hosts have been upgraded/rebooted, this is likely expected" echo " . Otherwise the most likely cause is network problems, such as link flapping or congestion." echo " . Review hardware and network stability, then contact customer success" fi diff --git a/scripts.d/ta/530_high_drive_read_ssd_ratio.sh b/scripts.d/ta/530_high_drive_read_ssd_ratio.sh index a64899e..4310f22 100755 --- a/scripts.d/ta/530_high_drive_read_ssd_ratio.sh +++ b/scripts.d/ta/530_high_drive_read_ssd_ratio.sh @@ -35,7 +35,7 @@ if [[ ${HIGHER_THAN_EXPECTED} == "YES" ]]; then echo "This could indicate a number of things, such as splitting of read requests or perhaps read amplification" echo "Review ${JIRA_REFERENCE} for details" echo "Recommended Resolutions:" - echo " . This may be expected behaviour for your workload" + echo " . This may be expected behavior for your workload" echo " .
The data may be read using much larger blocksizes than those in which it was written, and matching those may help" fi diff --git a/scripts.d/ta/755_wekapp424920_smbw_mask.sh b/scripts.d/ta/755_wekapp424920_smbw_mask.sh index c819f75..233bd3e 100644 --- a/scripts.d/ta/755_wekapp424920_smbw_mask.sh +++ b/scripts.d/ta/755_wekapp424920_smbw_mask.sh @@ -35,14 +35,14 @@ if weka smb cluster | awk '/Type:/ && /smbw/' &> /dev/null; then if [[ $NUM_SHARES -ne $NUM_FILE_MASKS ]]; then echo "WARN: there are $NUM_SHARES smbw shares, but only $NUM_FILE_MASKS shares with force_create_mode" echo "Recommended Resolution: for each share, delete and re-create it to ensure this mode is set." - echo " NB: this will likely be service-affecting" + echo " WARNING: this will likely be service-affecting" RETURN_CODE=254 fi if [[ $NUM_SHARES -ne $NUM_DIR_MASKS ]]; then echo "WARN: there are $NUM_SHARES smbw shares, but only $NUM_DIR_MASKS shares with force_directory_mode" echo "Recommended Resolution: for each share, delete and re-create it to ensure this mode is set." - echo " NB: this will likely be service-affecting" + echo " WARNING: this will likely be service-affecting" RETURN_CODE=254 fi else diff --git a/scripts.d/ta/785_asymmetric_mtu.sh b/scripts.d/ta/785_asymmetric_mtu.sh index 098ee4c..6792d51 100644 --- a/scripts.d/ta/785_asymmetric_mtu.sh +++ b/scripts.d/ta/785_asymmetric_mtu.sh @@ -17,7 +17,7 @@ for INDIVIDUAL_DRIVE_PROCESS in $(weka cluster process --backends --filter role= echo "WARN: Asymmetric MTU detected for at least one peer of ${host}, process id ${INDIVIDUAL_DRIVE_PROCESS}" - echo "Recommended Resolution: The usual cause for this is assymetric routing, with different MTUs configured" + echo "Recommended Resolution: The usual cause for this is asymmetric routing, with different MTUs configured" echo "along the two different paths. Run a tracepath/traceroute from each end of the backend<->client" - echo "connection, and determine if routes take different paths. It's likely that different will have" + echo "connection, and determine if routes take different paths.
It's likely that different paths will have" echo "different pMTUs, and every intervening link on the path with the smaller MTU should be checked" RETURN_CODE=254 fi diff --git a/scripts.d/ta/795_netmask_mismatch.sh b/scripts.d/ta/795_netmask_mismatch.sh index 81eea88..3ce0dd4 100644 --- a/scripts.d/ta/795_netmask_mismatch.sh +++ b/scripts.d/ta/795_netmask_mismatch.sh @@ -36,7 +36,7 @@ else echo "Recommended Resolution: determine which of these netmasks is correct, and rectify the one with" echo "the wrong configuration. If Weka needs re-configuring, this will be done with commands like" echo " weka local resources --container net remove " - echo " weka local resources --container net add --ips --netmask " + echo " weka local resources --container net add --netmask " fi exit ${RETURN_CODE}