diff --git a/bubble-server/src/main/java/bubble/cloud/compute/vultr/VultrDriver.java b/bubble-server/src/main/java/bubble/cloud/compute/vultr/VultrDriver.java index d6a25e8b..b4af779b 100644 --- a/bubble-server/src/main/java/bubble/cloud/compute/vultr/VultrDriver.java +++ b/bubble-server/src/main/java/bubble/cloud/compute/vultr/VultrDriver.java @@ -33,6 +33,7 @@ import static org.cobbzilla.util.daemon.ZillaRuntime.*; import static org.cobbzilla.util.http.HttpMethods.POST; import static org.cobbzilla.util.http.HttpStatusCodes.*; import static org.cobbzilla.util.http.HttpUtil.getResponse; +import static org.cobbzilla.util.json.JsonUtil.COMPACT_MAPPER; import static org.cobbzilla.util.json.JsonUtil.json; import static org.cobbzilla.util.system.Sleep.sleep; import static org.cobbzilla.wizard.resources.ResourceUtil.invalidEx; @@ -129,6 +130,7 @@ public class VultrDriver extends ComputeServiceDriverBase { final HttpRequestBean serverRequest = auth(new HttpRequestBean(POST, CREATE_SERVER_URL, data)); // create server, check response + if (log.isInfoEnabled()) log.info("start: calling Vultr to start node: "+node.id()); final HttpResponseBean serverResponse = serverRequest.curl(); // fixme: we can do better than shelling to curl if (serverResponse.getStatus() != 200) return die("start: error creating server: " + serverResponse); final JsonNode responseJson; @@ -138,6 +140,7 @@ public class VultrDriver extends ComputeServiceDriverBase { return die("start: error creating server (error parsing response as JSON): " + serverResponse); } final var subId = responseJson.get(VULTR_SUBID).textValue(); + if (log.isDebugEnabled()) log.debug("start: Vultr started node: "+node.id()+" SUBID="+subId); node.setState(BubbleNodeState.booting); node.setTag(TAG_INSTANCE_ID, subId); @@ -151,10 +154,10 @@ public class VultrDriver extends ComputeServiceDriverBase { sleep(SERVER_START_POLL_INTERVAL); final HttpResponseBean pollResponse = getResponse(poll); if (pollResponse.getStatus() != OK) { - return die("start: error polling subid: "+subId+": "+pollResponse); + return die("start: error polling node "+node.id()+" subid: "+subId+": "+pollResponse); } - // todo: add timeout, if server doesn't come up within X minutes, try to kill it and report an error final JsonNode serverNode = json(pollResponse.getEntityString(), JsonNode.class); + if (log.isDebugEnabled()) log.debug("start: polled node "+node.id()+" json="+json(serverNode, COMPACT_MAPPER)); if (serverNode != null) { if (serverNode.has("tag") && serverNode.get("tag").textValue().equals(cloud.getUuid()) @@ -166,7 +169,7 @@ public class VultrDriver extends ComputeServiceDriverBase { final String serverState = serverNode.get("server_state").textValue(); final String ip4 = serverNode.get(VULTR_V4_IP).textValue(); final String ip6 = serverNode.get(VULTR_V6_IP).textValue(); - // log.info("start: server_state="+serverState+", status="+status, "ip4="+ip4+", ip6="+ip6); + // if (log.isInfoEnabled()) log.info("start: server_state="+serverState+", status="+status, "ip4="+ip4+", ip6="+ip6); if (ip4 != null && ip4.length() > 0 && !ip4.equals("0.0.0.0")) { node.setIp4(ip4); @@ -181,7 +184,7 @@ public class VultrDriver extends ComputeServiceDriverBase { nodeDAO.update(node); } if (serverState.equals("ok")) { - log.info("start: server is ready: "+node.id()); + if (log.isInfoEnabled()) log.info("start: server is ready: "+node.id()); startedOk = true; break; } @@ -189,7 +192,7 @@ public class VultrDriver extends ComputeServiceDriverBase { } } if (!startedOk) { - log.error("start: timeout waiting for node to boot and become available, stopping it"); + if (log.isErrorEnabled()) log.error("start: timeout waiting for node "+node.id()+" to boot and become available, stopping it"); stop(node); } return node; @@ -210,16 +213,16 @@ public class VultrDriver extends ComputeServiceDriverBase { try { _stop(node); } catch (EntityNotFoundException e) { - log.info("stop: node stopped"); + if (log.isInfoEnabled()) log.info("stop: node stopped"); return node; } catch (Exception e) { lastEx = e; } sleep(SERVER_STOP_CHECK_INTERVAL, "stop: waiting to try stopping again until node is not found"); - log.warn("stop: node still running: "+node.id()); + if (log.isWarnEnabled()) log.warn("stop: node still running: "+node.id()); } - log.error("stop: error stopping node: "+node.id()); + if (log.isErrorEnabled()) log.error("stop: error stopping node: "+node.id()); if (lastEx != null) throw lastEx; return die("stop: timeout stopping node: "+node.id()); } @@ -231,7 +234,7 @@ public class VultrDriver extends ComputeServiceDriverBase { if (ip4 == null) { throw notFoundEx(node.id()); } - log.warn("stop: no "+TAG_INSTANCE_ID+" tag found on node ("+node.getFqdn()+"/"+ ip4 +"), searching based in ip4..."); + if (log.isWarnEnabled()) log.warn("stop: no "+TAG_INSTANCE_ID+" tag found on node ("+node.getFqdn()+"/"+ ip4 +"), searching based in ip4..."); vultrNode = findByIp4(node, ip4); } else { // does the node still exist? @@ -267,11 +270,11 @@ public class VultrDriver extends ComputeServiceDriverBase { .findFirst() .orElse(null); if (found == null) { - log.warn("stop: no subid tag found on node ("+node.getFqdn()+"/"+ ip4 +") and no server had this ip4"); + if (log.isWarnEnabled()) log.warn("stop: no subid tag found on node ("+node.getFqdn()+"/"+ ip4 +") and no server had this ip4"); return null; } if (!found.hasTag(TAG_INSTANCE_ID)) { - log.warn("stop: no subid tag found on node ("+node.getFqdn()+"/"+ ip4 +"), cannot stop"); + if (log.isWarnEnabled()) log.warn("stop: no subid tag found on node ("+node.getFqdn()+"/"+ ip4 +"), cannot stop"); return null; } return found; @@ -292,12 +295,12 @@ public class VultrDriver extends ComputeServiceDriverBase { || (ip4 != null && node.hasIp4() && ip4.textValue().equals(node.getIp4())) || (ip6 != null && node.hasIp6() && ip6.textValue().equals(node.getIp6())) ? node : null; } catch (Exception e) { - log.error("listNode: error finding node "+node.id()+", status="+listResponse.getStatus()+": "+listResponse+": exception="+shortError(e)); + if (log.isErrorEnabled()) log.error("listNode: error finding node "+node.id()+", status="+listResponse.getStatus()+": "+listResponse+": exception="+shortError(e)); return null; } case NOT_FOUND: return null; default: - log.error("listNode: error finding node "+node.id()+", status="+listResponse.getStatus()+": "+listResponse); + if (log.isErrorEnabled()) log.error("listNode: error finding node "+node.id()+", status="+listResponse.getStatus()+": "+listResponse); return null; } } @@ -320,7 +323,7 @@ public class VultrDriver extends ComputeServiceDriverBase { final String subid = iter.next(); final ObjectNode server = (ObjectNode) entity.get(subid); if (!filter.apply(server)) { - log.debug("Skipping node without cloud tag "+cloud.getUuid()+": "+subid); + if (log.isDebugEnabled()) log.debug("Skipping node without cloud tag "+cloud.getUuid()+": "+subid); continue; } final String subId = server.has(VULTR_SUBID) ? server.get(VULTR_SUBID).textValue() : null; diff --git a/bubble-server/src/main/java/bubble/service/cloud/NodeLaunchException.java b/bubble-server/src/main/java/bubble/service/cloud/NodeLaunchException.java index b1b9a58c..659d494b 100644 --- a/bubble-server/src/main/java/bubble/service/cloud/NodeLaunchException.java +++ b/bubble-server/src/main/java/bubble/service/cloud/NodeLaunchException.java @@ -9,50 +9,59 @@ import lombok.Getter; public class NodeLaunchException extends RuntimeException { + public enum NodeLaunchExceptionType { fatal, canRetry, interrupted } + @Getter private final BubbleNode node; public boolean hasNode () { return node != null; } public String nodeSummary () { return node == null ? "null" : node.id()+"/"+node.getState(); } - @Getter private final boolean fatal; + @Getter private final NodeLaunchExceptionType type; - private NodeLaunchException (BubbleNode node, Exception e, String message, boolean fatal) { + private NodeLaunchException (BubbleNode node, Exception e, String message, NodeLaunchExceptionType type) { super(message, e); this.node = node; - this.fatal = fatal; + this.type = type; } - private NodeLaunchException (BubbleNode node, Exception e, boolean fatal) { - this(node, e, e.getMessage(), fatal); + private NodeLaunchException (BubbleNode node, Exception e, NodeLaunchExceptionType type) { + this(node, e, e.getMessage(), type); } - private NodeLaunchException (BubbleNode node, String message, boolean fatal) { - this(node, null, message, fatal); + private NodeLaunchException (BubbleNode node, String message, NodeLaunchExceptionType type) { + this(node, null, message, type); } - private NodeLaunchException (Exception e, String message, boolean fatal) { - this(null, e, message, fatal); + private NodeLaunchException (Exception e, String message, NodeLaunchExceptionType type) { + this(null, e, message, type); } - private NodeLaunchException (String message, boolean fatal) { - this(null, null, message, fatal); + private NodeLaunchException (String message, NodeLaunchExceptionType type) { + this(null, null, message, type); } - private NodeLaunchException (Exception e, boolean fatal) { - this(null, e, e.getMessage(), fatal); + private NodeLaunchException (Exception e, NodeLaunchExceptionType type) { + this(null, e, e.getMessage(), type); } - public static T fatalLaunchFailure (String message) { throw new NodeLaunchException(message, true); } - public static T fatalLaunchFailure (Exception e, String message) { throw new NodeLaunchException(e, message, true); } - public static T fatalLaunchFailure (Exception e) { throw new NodeLaunchException(e, true); } - public static T fatalLaunchFailure (BubbleNode node, String message) { throw new NodeLaunchException(node, message, true); } - public static T fatalLaunchFailure (BubbleNode node, Exception e) { throw new NodeLaunchException(node, e, true); } - public static T fatalLaunchFailure (BubbleNode node, Exception e, String message) { throw new NodeLaunchException(node, e, message, true); } - - public static T launchFailureCanRetry (String message) { throw new NodeLaunchException(message, false); } - public static T launchFailureCanRetry (Exception e, String message) { throw new NodeLaunchException(e, message, false); } - public static T launchFailureCanRetry (Exception e) { throw new NodeLaunchException(e, false); } - public static T launchFailureCanRetry (BubbleNode node, String message) { throw new NodeLaunchException(node, message, false); } - public static T launchFailureCanRetry (BubbleNode node, Exception e) { throw new NodeLaunchException(node, e, false); } - public static T launchFailureCanRetry (BubbleNode node, Exception e, String message) { throw new NodeLaunchException(node, e, message, false); } + public static T fatalLaunchFailure (String message) { throw new NodeLaunchException(message, NodeLaunchExceptionType.fatal); } + public static T fatalLaunchFailure (Exception e, String message) { throw new NodeLaunchException(e, message, NodeLaunchExceptionType.fatal); } + public static T fatalLaunchFailure (Exception e) { throw new NodeLaunchException(e, NodeLaunchExceptionType.fatal); } + public static T fatalLaunchFailure (BubbleNode node, String message) { throw new NodeLaunchException(node, message, NodeLaunchExceptionType.fatal); } + public static T fatalLaunchFailure (BubbleNode node, Exception e) { throw new NodeLaunchException(node, e, NodeLaunchExceptionType.fatal); } + public static T fatalLaunchFailure (BubbleNode node, Exception e, String message) { throw new NodeLaunchException(node, e, message, NodeLaunchExceptionType.fatal); } + + public static T launchFailureCanRetry (String message) { throw new NodeLaunchException(message, NodeLaunchExceptionType.canRetry); } + public static T launchFailureCanRetry (Exception e, String message) { throw new NodeLaunchException(e, message, NodeLaunchExceptionType.canRetry); } + public static T launchFailureCanRetry (Exception e) { throw new NodeLaunchException(e, NodeLaunchExceptionType.canRetry); } + public static T launchFailureCanRetry (BubbleNode node, String message) { throw new NodeLaunchException(node, message, NodeLaunchExceptionType.canRetry); } + public static T launchFailureCanRetry (BubbleNode node, Exception e) { throw new NodeLaunchException(node, e, NodeLaunchExceptionType.canRetry); } + public static T launchFailureCanRetry (BubbleNode node, Exception e, String message) { throw new NodeLaunchException(node, e, message, NodeLaunchExceptionType.canRetry); } + + public static T launchInterrupted (String message) { throw new NodeLaunchException(message, NodeLaunchExceptionType.interrupted); } + public static T launchInterrupted (Exception e, String message) { throw new NodeLaunchException(e, message, NodeLaunchExceptionType.interrupted); } + public static T launchInterrupted (Exception e) { throw new NodeLaunchException(e, NodeLaunchExceptionType.interrupted); } + public static T launchInterrupted (BubbleNode node, String message) { throw new NodeLaunchException(node, message, NodeLaunchExceptionType.interrupted); } + public static T launchInterrupted (BubbleNode node, Exception e) { throw new NodeLaunchException(node, e, NodeLaunchExceptionType.interrupted); } + public static T launchInterrupted (BubbleNode node, Exception e, String message) { throw new NodeLaunchException(node, e, message, NodeLaunchExceptionType.interrupted); } } diff --git a/bubble-server/src/main/java/bubble/service/cloud/NodeLauncher.java b/bubble-server/src/main/java/bubble/service/cloud/NodeLauncher.java index 1cf48cec..a1b9144e 100644 --- a/bubble-server/src/main/java/bubble/service/cloud/NodeLauncher.java +++ b/bubble-server/src/main/java/bubble/service/cloud/NodeLauncher.java @@ -58,10 +58,18 @@ public class NodeLauncher implements Runnable { if (exception != null) { if (exception instanceof NodeLaunchException) { final NodeLaunchException launchException = (NodeLaunchException) exception; - if (launchException.isFatal()) { - die("NodeLauncher.run: fatal launch exception: " + shortError(launchException)); - } else { - log.warn("NodeLauncher.run: nonfatal launch exception for node " + launchException.nodeSummary() + " : " + shortError(launchException)); + switch (launchException.getType()) { + case fatal: + die("NodeLauncher.run: fatal launch exception: " + shortError(launchException)); + break; + case interrupted: + log.warn("NodeLauncher.run: launch interrupted, exiting early"); + return; + case canRetry: + log.warn("NodeLauncher.run: nonfatal launch exception for node " + launchException.nodeSummary() + " : " + shortError(launchException)); + break; + default: + die("NodeLauncher.run: unknown launch exception (type="+launchException.getType()+"): "+shortError(launchException)); } } else { die("NodeLauncher.run: fatal launch exception: " + shortError(exception)); diff --git a/bubble-server/src/main/java/bubble/service/cloud/StandardNetworkService.java b/bubble-server/src/main/java/bubble/service/cloud/StandardNetworkService.java index c4e2c432..406ad4e2 100644 --- a/bubble-server/src/main/java/bubble/service/cloud/StandardNetworkService.java +++ b/bubble-server/src/main/java/bubble/service/cloud/StandardNetworkService.java @@ -49,6 +49,7 @@ import org.cobbzilla.util.io.TempDir; import org.cobbzilla.util.system.Command; import org.cobbzilla.util.system.CommandResult; import org.cobbzilla.util.system.CommandShell; +import org.cobbzilla.util.system.SleepInterruptedException; import org.cobbzilla.wizard.api.ApiException; import org.cobbzilla.wizard.cache.redis.RedisService; import org.cobbzilla.wizard.validation.MultiViolationException; @@ -76,8 +77,7 @@ import static bubble.model.cloud.BubbleNode.TAG_ERROR; import static bubble.server.BubbleConfiguration.DEBUG_NODE_INSTALL_FILE; import static bubble.server.BubbleConfiguration.ENV_DEBUG_NODE_INSTALL; import static bubble.service.boot.StandardSelfNodeService.*; -import static bubble.service.cloud.NodeLaunchException.fatalLaunchFailure; -import static bubble.service.cloud.NodeLaunchException.launchFailureCanRetry; +import static bubble.service.cloud.NodeLaunchException.*; import static bubble.service.cloud.NodeProgressMeterConstants.*; import static java.util.concurrent.TimeUnit.MINUTES; import static java.util.concurrent.TimeUnit.SECONDS; @@ -148,6 +148,7 @@ public class StandardNetworkService implements NetworkService { NodeProgressMeter progressMeter = null; final BubbleNetwork network = nn.getNetworkObject(); final ExecutorService backgroundJobs = DaemonThreadFactory.fixedPool(3); + boolean killNode = false; try { progressMeter = launchMonitor.getProgressMeter(nn); progressMeter.write(METER_TICK_CONFIRMING_NETWORK_LOCK); @@ -410,18 +411,14 @@ public class StandardNetworkService implements NetworkService { log.info("newNode: ready in "+formatDuration(now() - start)); } catch (Exception e) { - log.error("newNode: "+e, e); - if (node != null) { - node.setState(BubbleNodeState.unknown_error); - nodeDAO.update(node); - if (!progressMeter.hasError()) progressMeter.error(METER_UNKNOWN_ERROR); - killNode(node, "error: "+e); - } - if (noNodesActive(network)) { - // if no nodes are running, then the network is stopped - networkDAO.update(network.setState(BubbleNetworkState.stopped)); + if (e instanceof SleepInterruptedException) { + log.warn("newNode: interrupted!"); + } else { + log.error("newNode: " + e, e); } + killNode = node != null; if (e instanceof NodeLaunchException) throw (NodeLaunchException) e; + if (e instanceof SleepInterruptedException) launchInterrupted("newNode: interrupted: "+shortError(e)); return die("newNode: "+e, e); } finally { @@ -434,7 +431,7 @@ public class StandardNetworkService implements NetworkService { } } - if (node != null && !node.isRunning()) { + if (node != null && (killNode || !node.isRunning())) { node.setState(BubbleNodeState.unknown_error); nodeDAO.update(node); if (!progressMeter.hasError()) progressMeter.error(METER_UNKNOWN_ERROR); diff --git a/bubble-server/src/main/java/bubble/service/cloud/job/NodeStartJob.java b/bubble-server/src/main/java/bubble/service/cloud/job/NodeStartJob.java index 6c7b3362..b036acbc 100644 --- a/bubble-server/src/main/java/bubble/service/cloud/job/NodeStartJob.java +++ b/bubble-server/src/main/java/bubble/service/cloud/job/NodeStartJob.java @@ -8,11 +8,13 @@ import bubble.cloud.compute.ComputeServiceDriver; import bubble.dao.cloud.BubbleNodeDAO; import bubble.model.cloud.BubbleNode; import bubble.model.cloud.BubbleNodeState; +import lombok.extern.slf4j.Slf4j; import static bubble.service.cloud.NodeProgressMeterConstants.METER_ERROR_NO_IP; import static bubble.service.cloud.NodeProgressMeterConstants.METER_ERROR_STARTING_NODE; import static org.cobbzilla.util.daemon.ZillaRuntime.shortError; +@Slf4j public class NodeStartJob implements Runnable { private BubbleNode node; @@ -31,7 +33,11 @@ public class NodeStartJob implements Runnable { try { node.setState(BubbleNodeState.booting); nodeDAO.update(node); + + log.debug("run: calling computeDriver.start("+node.id()+")"); node = computeDriver.start(node); + log.debug("run: computeDriver.start("+node.id()+") returned successfully"); + node.setState(BubbleNodeState.booted); nodeDAO.update(node); diff --git a/bubble-server/src/main/java/bubble/service/notify/NotificationService.java b/bubble-server/src/main/java/bubble/service/notify/NotificationService.java index 32d61df5..64a0671a 100644 --- a/bubble-server/src/main/java/bubble/service/notify/NotificationService.java +++ b/bubble-server/src/main/java/bubble/service/notify/NotificationService.java @@ -22,6 +22,7 @@ import org.cobbzilla.wizard.util.RestResponse; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; +import javax.net.ssl.SSLException; import java.net.ConnectException; import java.net.UnknownHostException; import java.util.List; @@ -123,17 +124,30 @@ public class NotificationService { log.debug("_notify: <<<<< RECEIPT <<<<<< " + json(receipt, COMPACT_MAPPER) + " <<<<<<<<<<<<<<<<<<"); return receipt; - } catch (ConnectException | ConnectTimeoutException | UnknownHostException | ApiException e) { + } catch (ConnectException | ConnectTimeoutException | UnknownHostException | SSLException | ApiException e) { notification.setStatus(NotificationSendStatus.error); notification.setException(e); sentNotificationDAO.update(notification); - throw new IllegalStateException("_notify: "+shortError(e), e); + return handleNotifyException(notification, e, true); } catch (Exception e) { notification.setStatus(NotificationSendStatus.error); notification.setException(e); sentNotificationDAO.update(notification); - return die("_notify: "+shortError(e), e); + return handleNotifyException(notification, e, true); + } + } + } + + public NotificationReceipt handleNotifyException(SentNotification notification, Exception e, boolean die) { + if (notification.getType() == NotificationType.health_check) { + log.error("_notify: health check failed for node "+notification.getToNode()+": "+shortError(e)); + return null; + } else { + if (die) { + return die("_notify: " + shortError(e), e); + } else { + throw new IllegalStateException("_notify: "+shortError(e), e); } } } diff --git a/utils/cobbzilla-utils b/utils/cobbzilla-utils index 009a52ed..1594a1ad 160000 --- a/utils/cobbzilla-utils +++ b/utils/cobbzilla-utils @@ -1 +1 @@ -Subproject commit 009a52edb53315fcb7d90c9feed382970aa9a4b8 +Subproject commit 1594a1ade170fd9b690682894f9b5f410659548e