@@ -33,6 +33,7 @@ import static org.cobbzilla.util.daemon.ZillaRuntime.*; | |||
import static org.cobbzilla.util.http.HttpMethods.POST; | |||
import static org.cobbzilla.util.http.HttpStatusCodes.*; | |||
import static org.cobbzilla.util.http.HttpUtil.getResponse; | |||
import static org.cobbzilla.util.json.JsonUtil.COMPACT_MAPPER; | |||
import static org.cobbzilla.util.json.JsonUtil.json; | |||
import static org.cobbzilla.util.system.Sleep.sleep; | |||
import static org.cobbzilla.wizard.resources.ResourceUtil.invalidEx; | |||
@@ -129,6 +130,7 @@ public class VultrDriver extends ComputeServiceDriverBase { | |||
final HttpRequestBean serverRequest = auth(new HttpRequestBean(POST, CREATE_SERVER_URL, data)); | |||
// create server, check response | |||
if (log.isInfoEnabled()) log.info("start: calling Vultr to start node: "+node.id()); | |||
final HttpResponseBean serverResponse = serverRequest.curl(); // fixme: we can do better than shelling to curl | |||
if (serverResponse.getStatus() != 200) return die("start: error creating server: " + serverResponse); | |||
final JsonNode responseJson; | |||
@@ -138,6 +140,7 @@ public class VultrDriver extends ComputeServiceDriverBase { | |||
return die("start: error creating server (error parsing response as JSON): " + serverResponse); | |||
} | |||
final var subId = responseJson.get(VULTR_SUBID).textValue(); | |||
if (log.isDebugEnabled()) log.debug("start: Vultr started node: "+node.id()+" SUBID="+subId); | |||
node.setState(BubbleNodeState.booting); | |||
node.setTag(TAG_INSTANCE_ID, subId); | |||
@@ -151,10 +154,10 @@ public class VultrDriver extends ComputeServiceDriverBase { | |||
sleep(SERVER_START_POLL_INTERVAL); | |||
final HttpResponseBean pollResponse = getResponse(poll); | |||
if (pollResponse.getStatus() != OK) { | |||
return die("start: error polling subid: "+subId+": "+pollResponse); | |||
return die("start: error polling node "+node.id()+" subid: "+subId+": "+pollResponse); | |||
} | |||
// todo: add timeout, if server doesn't come up within X minutes, try to kill it and report an error | |||
final JsonNode serverNode = json(pollResponse.getEntityString(), JsonNode.class); | |||
if (log.isDebugEnabled()) log.debug("start: polled node "+node.id()+" json="+json(serverNode, COMPACT_MAPPER)); | |||
if (serverNode != null) { | |||
if (serverNode.has("tag") | |||
&& serverNode.get("tag").textValue().equals(cloud.getUuid()) | |||
@@ -166,7 +169,7 @@ public class VultrDriver extends ComputeServiceDriverBase { | |||
final String serverState = serverNode.get("server_state").textValue(); | |||
final String ip4 = serverNode.get(VULTR_V4_IP).textValue(); | |||
final String ip6 = serverNode.get(VULTR_V6_IP).textValue(); | |||
// log.info("start: server_state="+serverState+", status="+status, "ip4="+ip4+", ip6="+ip6); | |||
// if (log.isInfoEnabled()) log.info("start: server_state="+serverState+", status="+status, "ip4="+ip4+", ip6="+ip6); | |||
if (ip4 != null && ip4.length() > 0 && !ip4.equals("0.0.0.0")) { | |||
node.setIp4(ip4); | |||
@@ -181,7 +184,7 @@ public class VultrDriver extends ComputeServiceDriverBase { | |||
nodeDAO.update(node); | |||
} | |||
if (serverState.equals("ok")) { | |||
log.info("start: server is ready: "+node.id()); | |||
if (log.isInfoEnabled()) log.info("start: server is ready: "+node.id()); | |||
startedOk = true; | |||
break; | |||
} | |||
@@ -189,7 +192,7 @@ public class VultrDriver extends ComputeServiceDriverBase { | |||
} | |||
} | |||
if (!startedOk) { | |||
log.error("start: timeout waiting for node to boot and become available, stopping it"); | |||
if (log.isErrorEnabled()) log.error("start: timeout waiting for node "+node.id()+" to boot and become available, stopping it"); | |||
stop(node); | |||
} | |||
return node; | |||
@@ -210,16 +213,16 @@ public class VultrDriver extends ComputeServiceDriverBase { | |||
try { | |||
_stop(node); | |||
} catch (EntityNotFoundException e) { | |||
log.info("stop: node stopped"); | |||
if (log.isInfoEnabled()) log.info("stop: node stopped"); | |||
return node; | |||
} catch (Exception e) { | |||
lastEx = e; | |||
} | |||
sleep(SERVER_STOP_CHECK_INTERVAL, "stop: waiting to try stopping again until node is not found"); | |||
log.warn("stop: node still running: "+node.id()); | |||
if (log.isWarnEnabled()) log.warn("stop: node still running: "+node.id()); | |||
} | |||
log.error("stop: error stopping node: "+node.id()); | |||
if (log.isErrorEnabled()) log.error("stop: error stopping node: "+node.id()); | |||
if (lastEx != null) throw lastEx; | |||
return die("stop: timeout stopping node: "+node.id()); | |||
} | |||
@@ -231,7 +234,7 @@ public class VultrDriver extends ComputeServiceDriverBase { | |||
if (ip4 == null) { | |||
throw notFoundEx(node.id()); | |||
} | |||
log.warn("stop: no "+TAG_INSTANCE_ID+" tag found on node ("+node.getFqdn()+"/"+ ip4 +"), searching based in ip4..."); | |||
if (log.isWarnEnabled()) log.warn("stop: no "+TAG_INSTANCE_ID+" tag found on node ("+node.getFqdn()+"/"+ ip4 +"), searching based in ip4..."); | |||
vultrNode = findByIp4(node, ip4); | |||
} else { | |||
// does the node still exist? | |||
@@ -267,11 +270,11 @@ public class VultrDriver extends ComputeServiceDriverBase { | |||
.findFirst() | |||
.orElse(null); | |||
if (found == null) { | |||
log.warn("stop: no subid tag found on node ("+node.getFqdn()+"/"+ ip4 +") and no server had this ip4"); | |||
if (log.isWarnEnabled()) log.warn("stop: no subid tag found on node ("+node.getFqdn()+"/"+ ip4 +") and no server had this ip4"); | |||
return null; | |||
} | |||
if (!found.hasTag(TAG_INSTANCE_ID)) { | |||
log.warn("stop: no subid tag found on node ("+node.getFqdn()+"/"+ ip4 +"), cannot stop"); | |||
if (log.isWarnEnabled()) log.warn("stop: no subid tag found on node ("+node.getFqdn()+"/"+ ip4 +"), cannot stop"); | |||
return null; | |||
} | |||
return found; | |||
@@ -292,12 +295,12 @@ public class VultrDriver extends ComputeServiceDriverBase { | |||
|| (ip4 != null && node.hasIp4() && ip4.textValue().equals(node.getIp4())) | |||
|| (ip6 != null && node.hasIp6() && ip6.textValue().equals(node.getIp6())) ? node : null; | |||
} catch (Exception e) { | |||
log.error("listNode: error finding node "+node.id()+", status="+listResponse.getStatus()+": "+listResponse+": exception="+shortError(e)); | |||
if (log.isErrorEnabled()) log.error("listNode: error finding node "+node.id()+", status="+listResponse.getStatus()+": "+listResponse+": exception="+shortError(e)); | |||
return null; | |||
} | |||
case NOT_FOUND: return null; | |||
default: | |||
log.error("listNode: error finding node "+node.id()+", status="+listResponse.getStatus()+": "+listResponse); | |||
if (log.isErrorEnabled()) log.error("listNode: error finding node "+node.id()+", status="+listResponse.getStatus()+": "+listResponse); | |||
return null; | |||
} | |||
} | |||
@@ -320,7 +323,7 @@ public class VultrDriver extends ComputeServiceDriverBase { | |||
final String subid = iter.next(); | |||
final ObjectNode server = (ObjectNode) entity.get(subid); | |||
if (!filter.apply(server)) { | |||
log.debug("Skipping node without cloud tag "+cloud.getUuid()+": "+subid); | |||
if (log.isDebugEnabled()) log.debug("Skipping node without cloud tag "+cloud.getUuid()+": "+subid); | |||
continue; | |||
} | |||
final String subId = server.has(VULTR_SUBID) ? server.get(VULTR_SUBID).textValue() : null; | |||
@@ -9,50 +9,59 @@ import lombok.Getter; | |||
// Unchecked exception (extends RuntimeException) describing a node-launch failure. | |||
// It optionally carries the BubbleNode involved and a classification of the failure. | |||
// All constructors are private; instances are only created (and immediately thrown) by the | |||
// static factory methods at the bottom of the class. | |||
// NOTE(review): this hunk appears to interleave pre-change lines (boolean `fatal`) with | |||
// post-change lines (NodeLaunchExceptionType `type`) without +/- markers -- confirm against the real file. | |||
public class NodeLaunchException extends RuntimeException { | |||
// Classification of a launch failure; each factory method family below maps to exactly one constant. | |||
public enum NodeLaunchExceptionType { fatal, canRetry, interrupted } | |||
// The node the failure relates to; null when the factory used did not take a node argument. | |||
@Getter private final BubbleNode node; | |||
public boolean hasNode () { return node != null; } | |||
// Short description for log messages: the literal string "null" when there is no node, otherwise "<id>/<state>". | |||
public String nodeSummary () { return node == null ? "null" : node.id()+"/"+node.getState(); } | |||
@Getter private final boolean fatal; | |||
@Getter private final NodeLaunchExceptionType type; | |||
// Primary constructor: every other constructor delegates here. `message` and `e` are passed to | |||
// RuntimeException as the detail message and cause; either `node` or `e` may be null. | |||
private NodeLaunchException (BubbleNode node, Exception e, String message, boolean fatal) { | |||
private NodeLaunchException (BubbleNode node, Exception e, String message, NodeLaunchExceptionType type) { | |||
super(message, e); | |||
this.node = node; | |||
this.fatal = fatal; | |||
this.type = type; | |||
} | |||
// Convenience constructors. The variants without a `message` parameter call e.getMessage(), | |||
// so they throw NullPointerException if `e` is null. | |||
private NodeLaunchException (BubbleNode node, Exception e, boolean fatal) { | |||
this(node, e, e.getMessage(), fatal); | |||
private NodeLaunchException (BubbleNode node, Exception e, NodeLaunchExceptionType type) { | |||
this(node, e, e.getMessage(), type); | |||
} | |||
private NodeLaunchException (BubbleNode node, String message, boolean fatal) { | |||
this(node, null, message, fatal); | |||
private NodeLaunchException (BubbleNode node, String message, NodeLaunchExceptionType type) { | |||
this(node, null, message, type); | |||
} | |||
private NodeLaunchException (Exception e, String message, boolean fatal) { | |||
this(null, e, message, fatal); | |||
private NodeLaunchException (Exception e, String message, NodeLaunchExceptionType type) { | |||
this(null, e, message, type); | |||
} | |||
private NodeLaunchException (String message, boolean fatal) { | |||
this(null, null, message, fatal); | |||
private NodeLaunchException (String message, NodeLaunchExceptionType type) { | |||
this(null, null, message, type); | |||
} | |||
private NodeLaunchException (Exception e, boolean fatal) { | |||
this(null, e, e.getMessage(), fatal); | |||
private NodeLaunchException (Exception e, NodeLaunchExceptionType type) { | |||
this(null, e, e.getMessage(), type); | |||
} | |||
// Static factories. Each one always throws and never returns normally; the generic return type | |||
// <T> only exists so a caller can write `return fatalLaunchFailure(...)` from a method of any return type. | |||
// The boolean-argument versions immediately below (true = fatal, false = can retry) are followed by | |||
// equivalents that pass a NodeLaunchExceptionType constant instead. | |||
public static <T> T fatalLaunchFailure (String message) { throw new NodeLaunchException(message, true); } | |||
public static <T> T fatalLaunchFailure (Exception e, String message) { throw new NodeLaunchException(e, message, true); } | |||
public static <T> T fatalLaunchFailure (Exception e) { throw new NodeLaunchException(e, true); } | |||
public static <T> T fatalLaunchFailure (BubbleNode node, String message) { throw new NodeLaunchException(node, message, true); } | |||
public static <T> T fatalLaunchFailure (BubbleNode node, Exception e) { throw new NodeLaunchException(node, e, true); } | |||
public static <T> T fatalLaunchFailure (BubbleNode node, Exception e, String message) { throw new NodeLaunchException(node, e, message, true); } | |||
public static <T> T launchFailureCanRetry (String message) { throw new NodeLaunchException(message, false); } | |||
public static <T> T launchFailureCanRetry (Exception e, String message) { throw new NodeLaunchException(e, message, false); } | |||
public static <T> T launchFailureCanRetry (Exception e) { throw new NodeLaunchException(e, false); } | |||
public static <T> T launchFailureCanRetry (BubbleNode node, String message) { throw new NodeLaunchException(node, message, false); } | |||
public static <T> T launchFailureCanRetry (BubbleNode node, Exception e) { throw new NodeLaunchException(node, e, false); } | |||
public static <T> T launchFailureCanRetry (BubbleNode node, Exception e, String message) { throw new NodeLaunchException(node, e, message, false); } | |||
public static <T> T fatalLaunchFailure (String message) { throw new NodeLaunchException(message, NodeLaunchExceptionType.fatal); } | |||
public static <T> T fatalLaunchFailure (Exception e, String message) { throw new NodeLaunchException(e, message, NodeLaunchExceptionType.fatal); } | |||
public static <T> T fatalLaunchFailure (Exception e) { throw new NodeLaunchException(e, NodeLaunchExceptionType.fatal); } | |||
public static <T> T fatalLaunchFailure (BubbleNode node, String message) { throw new NodeLaunchException(node, message, NodeLaunchExceptionType.fatal); } | |||
public static <T> T fatalLaunchFailure (BubbleNode node, Exception e) { throw new NodeLaunchException(node, e, NodeLaunchExceptionType.fatal); } | |||
public static <T> T fatalLaunchFailure (BubbleNode node, Exception e, String message) { throw new NodeLaunchException(node, e, message, NodeLaunchExceptionType.fatal); } | |||
public static <T> T launchFailureCanRetry (String message) { throw new NodeLaunchException(message, NodeLaunchExceptionType.canRetry); } | |||
public static <T> T launchFailureCanRetry (Exception e, String message) { throw new NodeLaunchException(e, message, NodeLaunchExceptionType.canRetry); } | |||
public static <T> T launchFailureCanRetry (Exception e) { throw new NodeLaunchException(e, NodeLaunchExceptionType.canRetry); } | |||
public static <T> T launchFailureCanRetry (BubbleNode node, String message) { throw new NodeLaunchException(node, message, NodeLaunchExceptionType.canRetry); } | |||
public static <T> T launchFailureCanRetry (BubbleNode node, Exception e) { throw new NodeLaunchException(node, e, NodeLaunchExceptionType.canRetry); } | |||
public static <T> T launchFailureCanRetry (BubbleNode node, Exception e, String message) { throw new NodeLaunchException(node, e, message, NodeLaunchExceptionType.canRetry); } | |||
// Factories for the `interrupted` type; same six overloads as the two families above. | |||
public static <T> T launchInterrupted (String message) { throw new NodeLaunchException(message, NodeLaunchExceptionType.interrupted); } | |||
public static <T> T launchInterrupted (Exception e, String message) { throw new NodeLaunchException(e, message, NodeLaunchExceptionType.interrupted); } | |||
public static <T> T launchInterrupted (Exception e) { throw new NodeLaunchException(e, NodeLaunchExceptionType.interrupted); } | |||
public static <T> T launchInterrupted (BubbleNode node, String message) { throw new NodeLaunchException(node, message, NodeLaunchExceptionType.interrupted); } | |||
public static <T> T launchInterrupted (BubbleNode node, Exception e) { throw new NodeLaunchException(node, e, NodeLaunchExceptionType.interrupted); } | |||
public static <T> T launchInterrupted (BubbleNode node, Exception e, String message) { throw new NodeLaunchException(node, e, message, NodeLaunchExceptionType.interrupted); } | |||
} |
@@ -58,10 +58,18 @@ public class NodeLauncher implements Runnable { | |||
if (exception != null) { | |||
if (exception instanceof NodeLaunchException) { | |||
final NodeLaunchException launchException = (NodeLaunchException) exception; | |||
if (launchException.isFatal()) { | |||
die("NodeLauncher.run: fatal launch exception: " + shortError(launchException)); | |||
} else { | |||
log.warn("NodeLauncher.run: nonfatal launch exception for node " + launchException.nodeSummary() + " : " + shortError(launchException)); | |||
switch (launchException.getType()) { | |||
case fatal: | |||
die("NodeLauncher.run: fatal launch exception: " + shortError(launchException)); | |||
break; | |||
case interrupted: | |||
log.warn("NodeLauncher.run: launch interrupted, exiting early"); | |||
return; | |||
case canRetry: | |||
log.warn("NodeLauncher.run: nonfatal launch exception for node " + launchException.nodeSummary() + " : " + shortError(launchException)); | |||
break; | |||
default: | |||
die("NodeLauncher.run: unknown launch exception (type="+launchException.getType()+"): "+shortError(launchException)); | |||
} | |||
} else { | |||
die("NodeLauncher.run: fatal launch exception: " + shortError(exception)); | |||
@@ -49,6 +49,7 @@ import org.cobbzilla.util.io.TempDir; | |||
import org.cobbzilla.util.system.Command; | |||
import org.cobbzilla.util.system.CommandResult; | |||
import org.cobbzilla.util.system.CommandShell; | |||
import org.cobbzilla.util.system.SleepInterruptedException; | |||
import org.cobbzilla.wizard.api.ApiException; | |||
import org.cobbzilla.wizard.cache.redis.RedisService; | |||
import org.cobbzilla.wizard.validation.MultiViolationException; | |||
@@ -76,8 +77,7 @@ import static bubble.model.cloud.BubbleNode.TAG_ERROR; | |||
import static bubble.server.BubbleConfiguration.DEBUG_NODE_INSTALL_FILE; | |||
import static bubble.server.BubbleConfiguration.ENV_DEBUG_NODE_INSTALL; | |||
import static bubble.service.boot.StandardSelfNodeService.*; | |||
import static bubble.service.cloud.NodeLaunchException.fatalLaunchFailure; | |||
import static bubble.service.cloud.NodeLaunchException.launchFailureCanRetry; | |||
import static bubble.service.cloud.NodeLaunchException.*; | |||
import static bubble.service.cloud.NodeProgressMeterConstants.*; | |||
import static java.util.concurrent.TimeUnit.MINUTES; | |||
import static java.util.concurrent.TimeUnit.SECONDS; | |||
@@ -148,6 +148,7 @@ public class StandardNetworkService implements NetworkService { | |||
NodeProgressMeter progressMeter = null; | |||
final BubbleNetwork network = nn.getNetworkObject(); | |||
final ExecutorService backgroundJobs = DaemonThreadFactory.fixedPool(3); | |||
boolean killNode = false; | |||
try { | |||
progressMeter = launchMonitor.getProgressMeter(nn); | |||
progressMeter.write(METER_TICK_CONFIRMING_NETWORK_LOCK); | |||
@@ -410,18 +411,14 @@ public class StandardNetworkService implements NetworkService { | |||
log.info("newNode: ready in "+formatDuration(now() - start)); | |||
} catch (Exception e) { | |||
log.error("newNode: "+e, e); | |||
if (node != null) { | |||
node.setState(BubbleNodeState.unknown_error); | |||
nodeDAO.update(node); | |||
if (!progressMeter.hasError()) progressMeter.error(METER_UNKNOWN_ERROR); | |||
killNode(node, "error: "+e); | |||
} | |||
if (noNodesActive(network)) { | |||
// if no nodes are running, then the network is stopped | |||
networkDAO.update(network.setState(BubbleNetworkState.stopped)); | |||
if (e instanceof SleepInterruptedException) { | |||
log.warn("newNode: interrupted!"); | |||
} else { | |||
log.error("newNode: " + e, e); | |||
} | |||
killNode = node != null; | |||
if (e instanceof NodeLaunchException) throw (NodeLaunchException) e; | |||
if (e instanceof SleepInterruptedException) launchInterrupted("newNode: interrupted: "+shortError(e)); | |||
return die("newNode: "+e, e); | |||
} finally { | |||
@@ -434,7 +431,7 @@ public class StandardNetworkService implements NetworkService { | |||
} | |||
} | |||
if (node != null && !node.isRunning()) { | |||
if (node != null && (killNode || !node.isRunning())) { | |||
node.setState(BubbleNodeState.unknown_error); | |||
nodeDAO.update(node); | |||
if (!progressMeter.hasError()) progressMeter.error(METER_UNKNOWN_ERROR); | |||
@@ -8,11 +8,13 @@ import bubble.cloud.compute.ComputeServiceDriver; | |||
import bubble.dao.cloud.BubbleNodeDAO; | |||
import bubble.model.cloud.BubbleNode; | |||
import bubble.model.cloud.BubbleNodeState; | |||
import lombok.extern.slf4j.Slf4j; | |||
import static bubble.service.cloud.NodeProgressMeterConstants.METER_ERROR_NO_IP; | |||
import static bubble.service.cloud.NodeProgressMeterConstants.METER_ERROR_STARTING_NODE; | |||
import static org.cobbzilla.util.daemon.ZillaRuntime.shortError; | |||
@Slf4j | |||
public class NodeStartJob implements Runnable { | |||
private BubbleNode node; | |||
@@ -31,7 +33,11 @@ public class NodeStartJob implements Runnable { | |||
try { | |||
node.setState(BubbleNodeState.booting); | |||
nodeDAO.update(node); | |||
log.debug("run: calling computeDriver.start("+node.id()+")"); | |||
node = computeDriver.start(node); | |||
log.debug("run: computeDriver.start("+node.id()+") returned successfully"); | |||
node.setState(BubbleNodeState.booted); | |||
nodeDAO.update(node); | |||
@@ -22,6 +22,7 @@ import org.cobbzilla.wizard.util.RestResponse; | |||
import org.springframework.beans.factory.annotation.Autowired; | |||
import org.springframework.stereotype.Service; | |||
import javax.net.ssl.SSLException; | |||
import java.net.ConnectException; | |||
import java.net.UnknownHostException; | |||
import java.util.List; | |||
@@ -123,17 +124,30 @@ public class NotificationService { | |||
log.debug("_notify: <<<<< RECEIPT <<<<<< " + json(receipt, COMPACT_MAPPER) + " <<<<<<<<<<<<<<<<<<"); | |||
return receipt; | |||
} catch (ConnectException | ConnectTimeoutException | UnknownHostException | ApiException e) { | |||
} catch (ConnectException | ConnectTimeoutException | UnknownHostException | SSLException | ApiException e) { | |||
notification.setStatus(NotificationSendStatus.error); | |||
notification.setException(e); | |||
sentNotificationDAO.update(notification); | |||
throw new IllegalStateException("_notify: "+shortError(e), e); | |||
return handleNotifyException(notification, e, true); | |||
} catch (Exception e) { | |||
notification.setStatus(NotificationSendStatus.error); | |||
notification.setException(e); | |||
sentNotificationDAO.update(notification); | |||
return die("_notify: "+shortError(e), e); | |||
return handleNotifyException(notification, e, true); | |||
} | |||
} | |||
} | |||
// Decides how a failure while sending `notification` is surfaced to the caller. | |||
// - health_check notifications: the failure is logged at error level and null is returned, | |||
//   so the exception `e` is NOT propagated. | |||
// - any other type: the failure is propagated with message "_notify: <shortError(e)>" and `e` as cause, | |||
//   via die(...) when `die` is true, or as an IllegalStateException when `die` is false. | |||
// Callers must therefore be prepared for a null receipt (health-check case). | |||
// NOTE(review): both call sites visible in this diff pass die=true, so the IllegalStateException | |||
// branch looks unreachable; the pre-change code threw IllegalStateException for the | |||
// connect/timeout/unknown-host catch -- confirm whether that call site should pass false. | |||
public NotificationReceipt handleNotifyException(SentNotification notification, Exception e, boolean die) { | |||
if (notification.getType() == NotificationType.health_check) { | |||
// a failed health check is reported in the log only; no exception reaches the caller | |||
log.error("_notify: health check failed for node "+notification.getToNode()+": "+shortError(e)); | |||
return null; | |||
} else { | |||
if (die) { | |||
// presumably ZillaRuntime.die, which throws rather than returning -- TODO confirm | |||
return die("_notify: " + shortError(e), e); | |||
} else { | |||
throw new IllegalStateException("_notify: "+shortError(e), e); | |||
} | |||
} | |||
} | |||
@@ -1 +1 @@ | |||
Subproject commit 009a52edb53315fcb7d90c9feed382970aa9a4b8 | |||
Subproject commit 1594a1ade170fd9b690682894f9b5f410659548e |