Преглед изворни кода

improve interrupted launch handling

tags/v0.14.0
Jonathan Cobb пре 4 година
родитељ
комит
4a312b0122
7 измењених фајлова са 98 додато и 61 уклоњено
  1. +17
    -14
      bubble-server/src/main/java/bubble/cloud/compute/vultr/VultrDriver.java
  2. +35
    -26
      bubble-server/src/main/java/bubble/service/cloud/NodeLaunchException.java
  3. +12
    -4
      bubble-server/src/main/java/bubble/service/cloud/NodeLauncher.java
  4. +10
    -13
      bubble-server/src/main/java/bubble/service/cloud/StandardNetworkService.java
  5. +6
    -0
      bubble-server/src/main/java/bubble/service/cloud/job/NodeStartJob.java
  6. +17
    -3
      bubble-server/src/main/java/bubble/service/notify/NotificationService.java
  7. +1
    -1
      utils/cobbzilla-utils

+ 17
- 14
bubble-server/src/main/java/bubble/cloud/compute/vultr/VultrDriver.java Прегледај датотеку

@@ -33,6 +33,7 @@ import static org.cobbzilla.util.daemon.ZillaRuntime.*;
import static org.cobbzilla.util.http.HttpMethods.POST;
import static org.cobbzilla.util.http.HttpStatusCodes.*;
import static org.cobbzilla.util.http.HttpUtil.getResponse;
import static org.cobbzilla.util.json.JsonUtil.COMPACT_MAPPER;
import static org.cobbzilla.util.json.JsonUtil.json;
import static org.cobbzilla.util.system.Sleep.sleep;
import static org.cobbzilla.wizard.resources.ResourceUtil.invalidEx;
@@ -129,6 +130,7 @@ public class VultrDriver extends ComputeServiceDriverBase {
final HttpRequestBean serverRequest = auth(new HttpRequestBean(POST, CREATE_SERVER_URL, data));

// create server, check response
if (log.isInfoEnabled()) log.info("start: calling Vultr to start node: "+node.id());
final HttpResponseBean serverResponse = serverRequest.curl(); // fixme: we can do better than shelling to curl
if (serverResponse.getStatus() != 200) return die("start: error creating server: " + serverResponse);
final JsonNode responseJson;
@@ -138,6 +140,7 @@ public class VultrDriver extends ComputeServiceDriverBase {
return die("start: error creating server (error parsing response as JSON): " + serverResponse);
}
final var subId = responseJson.get(VULTR_SUBID).textValue();
if (log.isDebugEnabled()) log.debug("start: Vultr started node: "+node.id()+" SUBID="+subId);

node.setState(BubbleNodeState.booting);
node.setTag(TAG_INSTANCE_ID, subId);
@@ -151,10 +154,10 @@ public class VultrDriver extends ComputeServiceDriverBase {
sleep(SERVER_START_POLL_INTERVAL);
final HttpResponseBean pollResponse = getResponse(poll);
if (pollResponse.getStatus() != OK) {
return die("start: error polling subid: "+subId+": "+pollResponse);
return die("start: error polling node "+node.id()+" subid: "+subId+": "+pollResponse);
}
// todo: add timeout, if server doesn't come up within X minutes, try to kill it and report an error
final JsonNode serverNode = json(pollResponse.getEntityString(), JsonNode.class);
if (log.isDebugEnabled()) log.debug("start: polled node "+node.id()+" json="+json(serverNode, COMPACT_MAPPER));
if (serverNode != null) {
if (serverNode.has("tag")
&& serverNode.get("tag").textValue().equals(cloud.getUuid())
@@ -166,7 +169,7 @@ public class VultrDriver extends ComputeServiceDriverBase {
final String serverState = serverNode.get("server_state").textValue();
final String ip4 = serverNode.get(VULTR_V4_IP).textValue();
final String ip6 = serverNode.get(VULTR_V6_IP).textValue();
// log.info("start: server_state="+serverState+", status="+status, "ip4="+ip4+", ip6="+ip6);
// if (log.isInfoEnabled()) log.info("start: server_state="+serverState+", status="+status, "ip4="+ip4+", ip6="+ip6);

if (ip4 != null && ip4.length() > 0 && !ip4.equals("0.0.0.0")) {
node.setIp4(ip4);
@@ -181,7 +184,7 @@ public class VultrDriver extends ComputeServiceDriverBase {
nodeDAO.update(node);
}
if (serverState.equals("ok")) {
log.info("start: server is ready: "+node.id());
if (log.isInfoEnabled()) log.info("start: server is ready: "+node.id());
startedOk = true;
break;
}
@@ -189,7 +192,7 @@ public class VultrDriver extends ComputeServiceDriverBase {
}
}
if (!startedOk) {
log.error("start: timeout waiting for node to boot and become available, stopping it");
if (log.isErrorEnabled()) log.error("start: timeout waiting for node "+node.id()+" to boot and become available, stopping it");
stop(node);
}
return node;
@@ -210,16 +213,16 @@ public class VultrDriver extends ComputeServiceDriverBase {
try {
_stop(node);
} catch (EntityNotFoundException e) {
log.info("stop: node stopped");
if (log.isInfoEnabled()) log.info("stop: node stopped");
return node;

} catch (Exception e) {
lastEx = e;
}
sleep(SERVER_STOP_CHECK_INTERVAL, "stop: waiting to try stopping again until node is not found");
log.warn("stop: node still running: "+node.id());
if (log.isWarnEnabled()) log.warn("stop: node still running: "+node.id());
}
log.error("stop: error stopping node: "+node.id());
if (log.isErrorEnabled()) log.error("stop: error stopping node: "+node.id());
if (lastEx != null) throw lastEx;
return die("stop: timeout stopping node: "+node.id());
}
@@ -231,7 +234,7 @@ public class VultrDriver extends ComputeServiceDriverBase {
if (ip4 == null) {
throw notFoundEx(node.id());
}
log.warn("stop: no "+TAG_INSTANCE_ID+" tag found on node ("+node.getFqdn()+"/"+ ip4 +"), searching based in ip4...");
if (log.isWarnEnabled()) log.warn("stop: no "+TAG_INSTANCE_ID+" tag found on node ("+node.getFqdn()+"/"+ ip4 +"), searching based in ip4...");
vultrNode = findByIp4(node, ip4);
} else {
// does the node still exist?
@@ -267,11 +270,11 @@ public class VultrDriver extends ComputeServiceDriverBase {
.findFirst()
.orElse(null);
if (found == null) {
log.warn("stop: no subid tag found on node ("+node.getFqdn()+"/"+ ip4 +") and no server had this ip4");
if (log.isWarnEnabled()) log.warn("stop: no subid tag found on node ("+node.getFqdn()+"/"+ ip4 +") and no server had this ip4");
return null;
}
if (!found.hasTag(TAG_INSTANCE_ID)) {
log.warn("stop: no subid tag found on node ("+node.getFqdn()+"/"+ ip4 +"), cannot stop");
if (log.isWarnEnabled()) log.warn("stop: no subid tag found on node ("+node.getFqdn()+"/"+ ip4 +"), cannot stop");
return null;
}
return found;
@@ -292,12 +295,12 @@ public class VultrDriver extends ComputeServiceDriverBase {
|| (ip4 != null && node.hasIp4() && ip4.textValue().equals(node.getIp4()))
|| (ip6 != null && node.hasIp6() && ip6.textValue().equals(node.getIp6())) ? node : null;
} catch (Exception e) {
log.error("listNode: error finding node "+node.id()+", status="+listResponse.getStatus()+": "+listResponse+": exception="+shortError(e));
if (log.isErrorEnabled()) log.error("listNode: error finding node "+node.id()+", status="+listResponse.getStatus()+": "+listResponse+": exception="+shortError(e));
return null;
}
case NOT_FOUND: return null;
default:
log.error("listNode: error finding node "+node.id()+", status="+listResponse.getStatus()+": "+listResponse);
if (log.isErrorEnabled()) log.error("listNode: error finding node "+node.id()+", status="+listResponse.getStatus()+": "+listResponse);
return null;
}
}
@@ -320,7 +323,7 @@ public class VultrDriver extends ComputeServiceDriverBase {
final String subid = iter.next();
final ObjectNode server = (ObjectNode) entity.get(subid);
if (!filter.apply(server)) {
log.debug("Skipping node without cloud tag "+cloud.getUuid()+": "+subid);
if (log.isDebugEnabled()) log.debug("Skipping node without cloud tag "+cloud.getUuid()+": "+subid);
continue;
}
final String subId = server.has(VULTR_SUBID) ? server.get(VULTR_SUBID).textValue() : null;


+ 35
- 26
bubble-server/src/main/java/bubble/service/cloud/NodeLaunchException.java Прегледај датотеку

@@ -9,50 +9,59 @@ import lombok.Getter;

public class NodeLaunchException extends RuntimeException {

public enum NodeLaunchExceptionType { fatal, canRetry, interrupted }

@Getter private final BubbleNode node;
public boolean hasNode () { return node != null; }
public String nodeSummary () { return node == null ? "null" : node.id()+"/"+node.getState(); }

@Getter private final boolean fatal;
@Getter private final NodeLaunchExceptionType type;

private NodeLaunchException (BubbleNode node, Exception e, String message, boolean fatal) {
private NodeLaunchException (BubbleNode node, Exception e, String message, NodeLaunchExceptionType type) {
super(message, e);
this.node = node;
this.fatal = fatal;
this.type = type;
}

private NodeLaunchException (BubbleNode node, Exception e, boolean fatal) {
this(node, e, e.getMessage(), fatal);
private NodeLaunchException (BubbleNode node, Exception e, NodeLaunchExceptionType type) {
this(node, e, e.getMessage(), type);
}

private NodeLaunchException (BubbleNode node, String message, boolean fatal) {
this(node, null, message, fatal);
private NodeLaunchException (BubbleNode node, String message, NodeLaunchExceptionType type) {
this(node, null, message, type);
}

private NodeLaunchException (Exception e, String message, boolean fatal) {
this(null, e, message, fatal);
private NodeLaunchException (Exception e, String message, NodeLaunchExceptionType type) {
this(null, e, message, type);
}

private NodeLaunchException (String message, boolean fatal) {
this(null, null, message, fatal);
private NodeLaunchException (String message, NodeLaunchExceptionType type) {
this(null, null, message, type);
}

private NodeLaunchException (Exception e, boolean fatal) {
this(null, e, e.getMessage(), fatal);
private NodeLaunchException (Exception e, NodeLaunchExceptionType type) {
this(null, e, e.getMessage(), type);
}

public static <T> T fatalLaunchFailure (String message) { throw new NodeLaunchException(message, true); }
public static <T> T fatalLaunchFailure (Exception e, String message) { throw new NodeLaunchException(e, message, true); }
public static <T> T fatalLaunchFailure (Exception e) { throw new NodeLaunchException(e, true); }
public static <T> T fatalLaunchFailure (BubbleNode node, String message) { throw new NodeLaunchException(node, message, true); }
public static <T> T fatalLaunchFailure (BubbleNode node, Exception e) { throw new NodeLaunchException(node, e, true); }
public static <T> T fatalLaunchFailure (BubbleNode node, Exception e, String message) { throw new NodeLaunchException(node, e, message, true); }

public static <T> T launchFailureCanRetry (String message) { throw new NodeLaunchException(message, false); }
public static <T> T launchFailureCanRetry (Exception e, String message) { throw new NodeLaunchException(e, message, false); }
public static <T> T launchFailureCanRetry (Exception e) { throw new NodeLaunchException(e, false); }
public static <T> T launchFailureCanRetry (BubbleNode node, String message) { throw new NodeLaunchException(node, message, false); }
public static <T> T launchFailureCanRetry (BubbleNode node, Exception e) { throw new NodeLaunchException(node, e, false); }
public static <T> T launchFailureCanRetry (BubbleNode node, Exception e, String message) { throw new NodeLaunchException(node, e, message, false); }
public static <T> T fatalLaunchFailure (String message) { throw new NodeLaunchException(message, NodeLaunchExceptionType.fatal); }
public static <T> T fatalLaunchFailure (Exception e, String message) { throw new NodeLaunchException(e, message, NodeLaunchExceptionType.fatal); }
public static <T> T fatalLaunchFailure (Exception e) { throw new NodeLaunchException(e, NodeLaunchExceptionType.fatal); }
public static <T> T fatalLaunchFailure (BubbleNode node, String message) { throw new NodeLaunchException(node, message, NodeLaunchExceptionType.fatal); }
public static <T> T fatalLaunchFailure (BubbleNode node, Exception e) { throw new NodeLaunchException(node, e, NodeLaunchExceptionType.fatal); }
public static <T> T fatalLaunchFailure (BubbleNode node, Exception e, String message) { throw new NodeLaunchException(node, e, message, NodeLaunchExceptionType.fatal); }

public static <T> T launchFailureCanRetry (String message) { throw new NodeLaunchException(message, NodeLaunchExceptionType.canRetry); }
public static <T> T launchFailureCanRetry (Exception e, String message) { throw new NodeLaunchException(e, message, NodeLaunchExceptionType.canRetry); }
public static <T> T launchFailureCanRetry (Exception e) { throw new NodeLaunchException(e, NodeLaunchExceptionType.canRetry); }
public static <T> T launchFailureCanRetry (BubbleNode node, String message) { throw new NodeLaunchException(node, message, NodeLaunchExceptionType.canRetry); }
public static <T> T launchFailureCanRetry (BubbleNode node, Exception e) { throw new NodeLaunchException(node, e, NodeLaunchExceptionType.canRetry); }
public static <T> T launchFailureCanRetry (BubbleNode node, Exception e, String message) { throw new NodeLaunchException(node, e, message, NodeLaunchExceptionType.canRetry); }

public static <T> T launchInterrupted (String message) { throw new NodeLaunchException(message, NodeLaunchExceptionType.interrupted); }
public static <T> T launchInterrupted (Exception e, String message) { throw new NodeLaunchException(e, message, NodeLaunchExceptionType.interrupted); }
public static <T> T launchInterrupted (Exception e) { throw new NodeLaunchException(e, NodeLaunchExceptionType.interrupted); }
public static <T> T launchInterrupted (BubbleNode node, String message) { throw new NodeLaunchException(node, message, NodeLaunchExceptionType.interrupted); }
public static <T> T launchInterrupted (BubbleNode node, Exception e) { throw new NodeLaunchException(node, e, NodeLaunchExceptionType.interrupted); }
public static <T> T launchInterrupted (BubbleNode node, Exception e, String message) { throw new NodeLaunchException(node, e, message, NodeLaunchExceptionType.interrupted); }

}

+ 12
- 4
bubble-server/src/main/java/bubble/service/cloud/NodeLauncher.java Прегледај датотеку

@@ -58,10 +58,18 @@ public class NodeLauncher implements Runnable {
if (exception != null) {
if (exception instanceof NodeLaunchException) {
final NodeLaunchException launchException = (NodeLaunchException) exception;
if (launchException.isFatal()) {
die("NodeLauncher.run: fatal launch exception: " + shortError(launchException));
} else {
log.warn("NodeLauncher.run: nonfatal launch exception for node " + launchException.nodeSummary() + " : " + shortError(launchException));
switch (launchException.getType()) {
case fatal:
die("NodeLauncher.run: fatal launch exception: " + shortError(launchException));
break;
case interrupted:
log.warn("NodeLauncher.run: launch interrupted, exiting early");
return;
case canRetry:
log.warn("NodeLauncher.run: nonfatal launch exception for node " + launchException.nodeSummary() + " : " + shortError(launchException));
break;
default:
die("NodeLauncher.run: unknown launch exception (type="+launchException.getType()+"): "+shortError(launchException));
}
} else {
die("NodeLauncher.run: fatal launch exception: " + shortError(exception));


+ 10
- 13
bubble-server/src/main/java/bubble/service/cloud/StandardNetworkService.java Прегледај датотеку

@@ -49,6 +49,7 @@ import org.cobbzilla.util.io.TempDir;
import org.cobbzilla.util.system.Command;
import org.cobbzilla.util.system.CommandResult;
import org.cobbzilla.util.system.CommandShell;
import org.cobbzilla.util.system.SleepInterruptedException;
import org.cobbzilla.wizard.api.ApiException;
import org.cobbzilla.wizard.cache.redis.RedisService;
import org.cobbzilla.wizard.validation.MultiViolationException;
@@ -76,8 +77,7 @@ import static bubble.model.cloud.BubbleNode.TAG_ERROR;
import static bubble.server.BubbleConfiguration.DEBUG_NODE_INSTALL_FILE;
import static bubble.server.BubbleConfiguration.ENV_DEBUG_NODE_INSTALL;
import static bubble.service.boot.StandardSelfNodeService.*;
import static bubble.service.cloud.NodeLaunchException.fatalLaunchFailure;
import static bubble.service.cloud.NodeLaunchException.launchFailureCanRetry;
import static bubble.service.cloud.NodeLaunchException.*;
import static bubble.service.cloud.NodeProgressMeterConstants.*;
import static java.util.concurrent.TimeUnit.MINUTES;
import static java.util.concurrent.TimeUnit.SECONDS;
@@ -148,6 +148,7 @@ public class StandardNetworkService implements NetworkService {
NodeProgressMeter progressMeter = null;
final BubbleNetwork network = nn.getNetworkObject();
final ExecutorService backgroundJobs = DaemonThreadFactory.fixedPool(3);
boolean killNode = false;
try {
progressMeter = launchMonitor.getProgressMeter(nn);
progressMeter.write(METER_TICK_CONFIRMING_NETWORK_LOCK);
@@ -410,18 +411,14 @@ public class StandardNetworkService implements NetworkService {
log.info("newNode: ready in "+formatDuration(now() - start));

} catch (Exception e) {
log.error("newNode: "+e, e);
if (node != null) {
node.setState(BubbleNodeState.unknown_error);
nodeDAO.update(node);
if (!progressMeter.hasError()) progressMeter.error(METER_UNKNOWN_ERROR);
killNode(node, "error: "+e);
}
if (noNodesActive(network)) {
// if no nodes are running, then the network is stopped
networkDAO.update(network.setState(BubbleNetworkState.stopped));
if (e instanceof SleepInterruptedException) {
log.warn("newNode: interrupted!");
} else {
log.error("newNode: " + e, e);
}
killNode = node != null;
if (e instanceof NodeLaunchException) throw (NodeLaunchException) e;
if (e instanceof SleepInterruptedException) launchInterrupted("newNode: interrupted: "+shortError(e));
return die("newNode: "+e, e);

} finally {
@@ -434,7 +431,7 @@ public class StandardNetworkService implements NetworkService {
}
}

if (node != null && !node.isRunning()) {
if (node != null && (killNode || !node.isRunning())) {
node.setState(BubbleNodeState.unknown_error);
nodeDAO.update(node);
if (!progressMeter.hasError()) progressMeter.error(METER_UNKNOWN_ERROR);


+ 6
- 0
bubble-server/src/main/java/bubble/service/cloud/job/NodeStartJob.java Прегледај датотеку

@@ -8,11 +8,13 @@ import bubble.cloud.compute.ComputeServiceDriver;
import bubble.dao.cloud.BubbleNodeDAO;
import bubble.model.cloud.BubbleNode;
import bubble.model.cloud.BubbleNodeState;
import lombok.extern.slf4j.Slf4j;

import static bubble.service.cloud.NodeProgressMeterConstants.METER_ERROR_NO_IP;
import static bubble.service.cloud.NodeProgressMeterConstants.METER_ERROR_STARTING_NODE;
import static org.cobbzilla.util.daemon.ZillaRuntime.shortError;

@Slf4j
public class NodeStartJob implements Runnable {

private BubbleNode node;
@@ -31,7 +33,11 @@ public class NodeStartJob implements Runnable {
try {
node.setState(BubbleNodeState.booting);
nodeDAO.update(node);

log.debug("run: calling computeDriver.start("+node.id()+")");
node = computeDriver.start(node);
log.debug("run: computeDriver.start("+node.id()+") returned successfully");

node.setState(BubbleNodeState.booted);
nodeDAO.update(node);



+ 17
- 3
bubble-server/src/main/java/bubble/service/notify/NotificationService.java Прегледај датотеку

@@ -22,6 +22,7 @@ import org.cobbzilla.wizard.util.RestResponse;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import javax.net.ssl.SSLException;
import java.net.ConnectException;
import java.net.UnknownHostException;
import java.util.List;
@@ -123,17 +124,30 @@ public class NotificationService {
log.debug("_notify: <<<<< RECEIPT <<<<<< " + json(receipt, COMPACT_MAPPER) + " <<<<<<<<<<<<<<<<<<");
return receipt;

} catch (ConnectException | ConnectTimeoutException | UnknownHostException | ApiException e) {
} catch (ConnectException | ConnectTimeoutException | UnknownHostException | SSLException | ApiException e) {
notification.setStatus(NotificationSendStatus.error);
notification.setException(e);
sentNotificationDAO.update(notification);
throw new IllegalStateException("_notify: "+shortError(e), e);
return handleNotifyException(notification, e, true);

} catch (Exception e) {
notification.setStatus(NotificationSendStatus.error);
notification.setException(e);
sentNotificationDAO.update(notification);
return die("_notify: "+shortError(e), e);
return handleNotifyException(notification, e, true);
}
}
}

/**
 * Decides how a failed notification send is surfaced to the caller.
 *
 * <p>A failure on a {@code health_check} notification is logged at error level and
 * reported by returning {@code null}; no exception is raised for it. For every other
 * notification type the failure is escalated, and the {@code die} flag selects how.</p>
 *
 * @param notification the notification whose send failed; its type and target node are read
 * @param e            the exception caught while sending
 * @param die          when {@code true}, escalate through the {@code die(message, cause)} helper;
 *                     when {@code false}, throw an {@link IllegalStateException} wrapping {@code e}
 * @return {@code null} for a failed health check; otherwise whatever {@code die(...)} yields
 *         (NOTE(review): {@code die} presumably always throws, so no value is expected here — confirm)
 * @throws IllegalStateException if this is not a health check and {@code die} is {@code false}
 */
public NotificationReceipt handleNotifyException(SentNotification notification, Exception e, boolean die) {
    // Health checks are allowed to fail quietly: log the failure and hand back null.
    final boolean healthCheck = notification.getType() == NotificationType.health_check;
    if (healthCheck) {
        log.error("_notify: health check failed for node "+notification.getToNode()+": "+shortError(e));
        return null;
    }

    // Any other notification type escalates; the flag only selects the mechanism.
    if (!die) {
        throw new IllegalStateException("_notify: "+shortError(e), e);
    }
    return die("_notify: " + shortError(e), e);
}


+ 1
- 1
utils/cobbzilla-utils

@@ -1 +1 @@
Subproject commit 009a52edb53315fcb7d90c9feed382970aa9a4b8
Subproject commit 1594a1ade170fd9b690682894f9b5f410659548e

Loading…
Откажи
Сачувај