Browse Source

add support for unavailable network location and retry in alternate location

tags/v0.14.1
Jonathan Cobb 4 years ago
parent
commit
eed026301b
25 changed files with 446 additions and 231 deletions
  1. +11
    -5
      bubble-server/src/main/java/bubble/cloud/CloudAndRegion.java
  2. +4
    -3
      bubble-server/src/main/java/bubble/cloud/CloudRegion.java
  3. +14
    -0
      bubble-server/src/main/java/bubble/cloud/compute/UnavailableComputeLocationException.java
  4. +45
    -38
      bubble-server/src/main/java/bubble/cloud/compute/vultr/VultrDriver.java
  5. +4
    -0
      bubble-server/src/main/java/bubble/model/cloud/BubbleNetworkState.java
  6. +5
    -1
      bubble-server/src/main/java/bubble/model/cloud/BubbleNode.java
  7. +6
    -2
      bubble-server/src/main/java/bubble/model/cloud/NetLocation.java
  8. +15
    -0
      bubble-server/src/main/java/bubble/model/cloud/RegionalServiceDriver.java
  9. +14
    -2
      bubble-server/src/main/java/bubble/notify/NewNodeNotification.java
  10. +13
    -36
      bubble-server/src/main/java/bubble/notify/NotificationHandler_hello_from_sage.java
  11. +11
    -7
      bubble-server/src/main/java/bubble/resources/cloud/NetworkActionsResource.java
  12. +47
    -10
      bubble-server/src/main/java/bubble/service/cloud/GeoService.java
  13. +17
    -1
      bubble-server/src/main/java/bubble/service/cloud/NodeLaunchException.java
  14. +10
    -6
      bubble-server/src/main/java/bubble/service/cloud/NodeLaunchMonitor.java
  15. +19
    -2
      bubble-server/src/main/java/bubble/service/cloud/NodeLauncher.java
  16. +18
    -10
      bubble-server/src/main/java/bubble/service/cloud/NodeProgressMeter.java
  17. +11
    -6
      bubble-server/src/main/java/bubble/service/cloud/NodeProgressMeterConstants.java
  18. +137
    -73
      bubble-server/src/main/java/bubble/service/cloud/StandardNetworkService.java
  19. +4
    -1
      bubble-server/src/main/java/bubble/service/cloud/job/NodeJobException.java
  20. +18
    -11
      bubble-server/src/main/java/bubble/service/cloud/job/NodeStartJob.java
  21. +1
    -0
      bubble-server/src/main/resources/logback.xml
  22. +12
    -7
      bubble-server/src/main/resources/message_templates/en_US/server/post_auth/ResourceMessages.properties
  23. +8
    -8
      bubble-server/src/test/java/bubble/mock/MockNetworkService.java
  24. +1
    -1
      bubble-web
  25. +1
    -1
      utils/cobbzilla-utils

+ 11
- 5
bubble-server/src/main/java/bubble/cloud/CloudAndRegion.java View File

@@ -5,16 +5,22 @@
package bubble.cloud;

import bubble.model.cloud.CloudService;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import lombok.*;
import lombok.experimental.Accessors;

@NoArgsConstructor @AllArgsConstructor @Accessors(chain=true)
@NoArgsConstructor @AllArgsConstructor @Accessors(chain=true) @EqualsAndHashCode
public class CloudAndRegion {

@Getter @Setter private CloudService cloud;
@Getter @Setter private CloudRegion region;

public CloudAndRegion (String cloudUuid, String regionInternalName) {
final CloudService c = new CloudService();
c.setUuid(cloudUuid);
setCloud(c);
final CloudRegion r = new CloudRegion();
r.setInternalName(regionInternalName);
setRegion(r);
}

}

+ 4
- 3
bubble-server/src/main/java/bubble/cloud/CloudRegion.java View File

@@ -5,6 +5,7 @@
package bubble.cloud;

import bubble.cloud.geoLocation.GeoLocation;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.Setter;
import lombok.ToString;
@@ -12,7 +13,9 @@ import lombok.experimental.Accessors;

import static java.util.UUID.randomUUID;

@Accessors(chain=true) @ToString(of={"cloud","name","internalName"})
@Accessors(chain=true)
@EqualsAndHashCode(of={"cloud", "internalName"})
@ToString(of={"cloud", "name", "internalName"})
public class CloudRegion {

public static final CloudRegion[] EMPTY_REGIONS = new CloudRegion[0];
@@ -30,6 +33,4 @@ public class CloudRegion {

@Getter @Setter private GeoLocation location;

@Getter @Setter private Double costFactor = 1.0d;

}

+ 14
- 0
bubble-server/src/main/java/bubble/cloud/compute/UnavailableComputeLocationException.java View File

@@ -0,0 +1,14 @@
package bubble.cloud.compute;

import bubble.model.cloud.BubbleNode;
import lombok.Getter;

public class UnavailableComputeLocationException extends RuntimeException {

@Getter private final BubbleNode node;

public UnavailableComputeLocationException(BubbleNode node, String message) {
super(message);
this.node = node;
}
}

+ 45
- 38
bubble-server/src/main/java/bubble/cloud/compute/vultr/VultrDriver.java View File

@@ -66,10 +66,15 @@ public class VultrDriver extends ComputeServiceDriverBase {
public static final String VULTR_STATE_OK = "ok";
public static final String VULTR_STATE_LOCKED = "locked";

public static final String VULTR_POWER_STATUS = "power_status";
public static final String VULTR_POWER_STOPPED = "stopped";
public static final String VULTR_POWER_RUNNING = "running";

public static final String CREATE_SERVER_URL = VULTR_API_BASE + "server/create";
public static final String DESTROY_SERVER_URL = VULTR_API_BASE + "server/destroy";
public static final String LIST_SERVERS_URL = VULTR_API_BASE + "server/list";
public static final String POLL_SERVER_URL = LIST_SERVERS_URL + "?" + VULTR_SUBID + "=";
public static final String PLAN_NOT_AVAILABLE = "Plan is not available in the selected datacenter.";

@Getter(lazy=true) private final List<CloudRegion> cloudRegions = loadCloudResources(REGIONS_URL, new VultrRegionParser());
@Getter(lazy=true) private final List<ComputeNodeSize> cloudSizes = loadCloudResources(PLANS_URL, new VultrComputeNodeSizeParser());
@@ -121,6 +126,12 @@ public class VultrDriver extends ComputeServiceDriverBase {
final CloudRegion region = config.getRegion(node.getRegion());
final ComputeNodeSize size = config.getSize(node.getSize());

// for testing
// if (region.getInternalName().equals("Atlanta") || region.getInternalName().equals("Seoul")) {
// log.error("start: throwing UnavailableComputeLocationException on purpose");
// throw new UnavailableComputeLocationException(node, PLAN_NOT_AVAILABLE);
// }

final Long regionId = getRegion(region.getInternalName()).getId();
if (regionId == null) return die("start: region not found: "+region.getInternalName());

@@ -143,9 +154,13 @@ public class VultrDriver extends ComputeServiceDriverBase {
if (log.isInfoEnabled()) log.info("start: calling Vultr to start node: "+node.id());
final HttpResponseBean serverResponse = serverRequest.curl(); // fixme: we can do better than shelling to curl
if (serverResponse.getStatus() != 200) return die("start: error creating server: " + serverResponse);
final String entityString = serverResponse.getEntityString();
if (entityString.contains(PLAN_NOT_AVAILABLE)) {
throw new UnavailableComputeLocationException(node, entityString);
}
final JsonNode responseJson;
try {
responseJson = json(serverResponse.getEntityString(), JsonNode.class);
responseJson = json(entityString, JsonNode.class);
} catch (IllegalStateException e) {
return die("start: error creating server (error parsing response as JSON): " + serverResponse);
}
@@ -157,7 +172,7 @@ public class VultrDriver extends ComputeServiceDriverBase {
// nodeDAO.update(node);

final long start = now();
boolean startedOk = false;
int startedOk = 0;
final HttpRequestBean poll = auth(new HttpRequestBean(POLL_SERVER_URL+subId));
sleep(SERVER_START_INITIAL_INTERVAL);
while (now() - start < SERVER_START_TIMEOUT) {
@@ -167,41 +182,49 @@ public class VultrDriver extends ComputeServiceDriverBase {
return die("start: error polling node "+node.id()+" subid: "+subId+": "+pollResponse);
}
final JsonNode serverNode = json(pollResponse.getEntityString(), JsonNode.class);
if (log.isDebugEnabled()) log.debug("start: polled node "+node.id()+" json="+json(serverNode, COMPACT_MAPPER));
// if (log.isDebugEnabled()) log.debug("start: polled node "+node.id()+" json="+json(serverNode, COMPACT_MAPPER));
if (serverNode != null) {
if (serverNode.has("tag")
&& serverNode.get("tag").textValue().equals(cloud.getUuid())
&& serverNode.has(VULTR_STATUS)
&& serverNode.has(VULTR_SERVER_STATE)
&& serverNode.has(VULTR_POWER_STATUS)
&& serverNode.has(VULTR_V4_IP)) {

final String status = serverNode.get(VULTR_STATUS).textValue();
final String serverState = serverNode.get(VULTR_SERVER_STATE).textValue();
final String powerStatus = serverNode.get(VULTR_POWER_STATUS).textValue();
final String ip4 = serverNode.get(VULTR_V4_IP).textValue();
final String ip6 = serverNode.get(VULTR_V6_IP).textValue();
// if (log.isInfoEnabled()) log.info("start: server_state="+serverState+", status="+status, "ip4="+ip4+", ip6="+ip6);
if (log.isDebugEnabled()) log.debug("start("+node.id()+"): found server_state="+serverState+", status="+status+", power_status="+powerStatus+", ip4="+ip4+", ip6="+ip6);

if (ip4 != null && ip4.length() > 0 && !ip4.equals("0.0.0.0")) {
node.setIp4(ip4);
// nodeDAO.update(node);
}
if (ip6 != null && ip6.length() > 0) {
node.setIp6(ip6);
// nodeDAO.update(node);
}
if (status.equals(VULTR_STATUS_ACTIVE) && (node.hasIp4() || node.hasIp6())) {
node.setState(BubbleNodeState.booted);
// nodeDAO.update(node);
}
if (serverState.equals(VULTR_STATE_OK)) {
if (log.isInfoEnabled()) log.info("start: server is ready: "+node.id());
startedOk = true;
break;
if (serverState.equals(VULTR_STATE_OK) && powerStatus.equals(VULTR_POWER_RUNNING)) {
startedOk++;
if (startedOk >= 3) {
if (log.isDebugEnabled()) log.debug("start("+node.id()+"): STARTED(startedOk="+startedOk+"): server_state="+serverState+", status="+status+", power_status="+powerStatus+", ip4="+ip4+", ip6="+ip6);
break;
}
if (log.isDebugEnabled()) log.debug("start("+node.id()+"): good news: startedOk="+startedOk+", server_state="+serverState+", status="+status+", power_status="+powerStatus+", ip4="+ip4+", ip6="+ip6);
} else {
startedOk = 0;
if (log.isDebugEnabled()) log.debug("start("+node.id()+"): bad news 1: reset startedOk="+startedOk+", server_state="+serverState+", status="+status+", power_status="+powerStatus+", ip4="+ip4+", ip6="+ip6);
}
} else {
startedOk = 0;
if (log.isDebugEnabled()) log.debug("start("+node.id()+"): bad news 2: reset startedOk="+startedOk+", server_state="+serverState+", status="+status+", power_status="+powerStatus+", ip4="+ip4+", ip6="+ip6);
}
}
}
}
if (!startedOk) {
if (startedOk < 3) {
if (log.isErrorEnabled()) log.error("start: timeout waiting for node "+node.id()+" to boot and become available, stopping it");
stop(node);
}
@@ -223,19 +246,19 @@ public class VultrDriver extends ComputeServiceDriverBase {
try {
_stop(node);
} catch (EntityNotFoundException e) {
if (log.isInfoEnabled()) log.info("stop: node stopped");
if (log.isInfoEnabled()) log.info("stop("+node.id()+"): node stopped");
return node;

} catch (Exception e) {
if (log.isInfoEnabled()) log.info("stop: _stop failed with: "+shortError(e));
if (log.isInfoEnabled()) log.info("stop("+node.id()+"): _stop failed with: "+shortError(e));
lastEx = e;
}
sleep(SERVER_STOP_CHECK_INTERVAL, "stop: waiting to try stopping again until node is not found");
if (log.isWarnEnabled()) log.warn("stop: node still running: "+node.id());
sleep(SERVER_STOP_CHECK_INTERVAL, "stop("+node.id()+"): waiting to try stopping again until node is not found");
if (log.isWarnEnabled()) log.warn("stop("+node.id()+"): node still running: "+node.id());
}
if (log.isErrorEnabled()) log.error("stop: error stopping node: "+node.id());
if (log.isErrorEnabled()) log.error("stop("+node.id()+"): error stopping node: "+node.id());
if (lastEx != null) throw lastEx;
return die("stop: timeout stopping node: "+node.id());
return die("stop("+node.id()+"): timeout stopping node: "+node.id());
}

public BubbleNode _stop(BubbleNode node) throws IOException {
@@ -247,11 +270,11 @@ public class VultrDriver extends ComputeServiceDriverBase {

final String subId = vultrNode.getTag(TAG_INSTANCE_ID);
if (subId == null) {
if (log.isErrorEnabled()) log.error("_stop: node "+node.id()+" is missing tag "+TAG_INSTANCE_ID+", cannot stop, throwing invalidEx");
throw invalidEx("err.node.stop.error", "stop: no " + VULTR_SUBID + " on node, returning");
if (log.isErrorEnabled()) log.error("_stop("+node.id()+"): node "+node.id()+" is missing tag "+TAG_INSTANCE_ID+", cannot stop, throwing invalidEx");
throw invalidEx("err.node.stop.error", "stop("+node.id()+"): no " + VULTR_SUBID + " on node, returning");
}

if (log.isInfoEnabled()) log.info("_stop: calling stopServer("+subId+") for node "+node.id());
if (log.isInfoEnabled()) log.info("_stop("+node.id()+"): calling stopServer("+subId+") for node "+node.id());
stopServer(subId);
return node;
}
@@ -264,22 +287,6 @@ public class VultrDriver extends ComputeServiceDriverBase {
}
}

private BubbleNode findByIp4(BubbleNode node, String ip4) throws IOException {
final BubbleNode found = listNodes().stream()
.filter(n -> n.hasIp4() && n.getIp4().equals(ip4))
.findFirst()
.orElse(null);
if (found == null) {
if (log.isWarnEnabled()) log.warn("stop: no subid tag found on node ("+node.getFqdn()+"/"+ ip4 +") and no server had this ip4");
return null;
}
if (!found.hasTag(TAG_INSTANCE_ID)) {
if (log.isWarnEnabled()) log.warn("stop: no subid tag found on node ("+node.getFqdn()+"/"+ ip4 +"), cannot stop");
return null;
}
return found;
}

public BubbleNode listNode(BubbleNode node) {
final HttpRequestBean listServerRequest = auth(new HttpRequestBean(POLL_SERVER_URL+node.getTag(TAG_INSTANCE_ID)));
final HttpResponseBean listResponse = listServerRequest.curl();


+ 4
- 0
bubble-server/src/main/java/bubble/model/cloud/BubbleNetworkState.java View File

@@ -18,4 +18,8 @@ public enum BubbleNetworkState {

public boolean canStop() { return this != stopped && this != error_stopping; }

public boolean isStopped() {
return this == BubbleNetworkState.stopped || this == BubbleNetworkState.error_stopping;
}

}

+ 5
- 1
bubble-server/src/main/java/bubble/model/cloud/BubbleNode.java View File

@@ -247,11 +247,15 @@ public class BubbleNode extends IdentifiableBase implements HasNetwork, HasBubbl
return new BubbleNodeQuickClient(this, configuration);
}

@JsonIgnore @Transient @Getter @Setter private volatile RuntimeException launchException;
public boolean hasLaunchException () { return launchException != null; }

public void waitForIpAddresses() throws TimeoutException {
final long start = now();
while ((!hasIp4() || !hasIp6()) && now() - start < IP_ADDR_TIMEOUT) {
while ((!hasIp4() || !hasIp6()) && !hasLaunchException() && now() - start < IP_ADDR_TIMEOUT) {
sleep(TimeUnit.SECONDS.toMillis(2), "waiting for node to have IP addresses");
}
if (hasLaunchException()) throw launchException;
if (!hasIp4() || !hasIp6()) throw new TimeoutException("waitForIpAddresses: timeout");
}



+ 6
- 2
bubble-server/src/main/java/bubble/model/cloud/NetLocation.java View File

@@ -11,6 +11,7 @@ import lombok.experimental.Accessors;

import java.io.Serializable;

import static org.cobbzilla.util.daemon.ZillaRuntime.bool;
import static org.cobbzilla.util.daemon.ZillaRuntime.empty;

@NoArgsConstructor @Accessors(chain=true)
@@ -25,8 +26,11 @@ public class NetLocation implements Serializable {
@Getter @Setter private String region;
public boolean hasRegion () { return !empty(region); }

public static NetLocation fromCloudAndRegion(String cloud, String region) {
return new NetLocation().setCloud(cloud).setRegion(region);
@Getter @Setter private Boolean exactRegion;
public boolean exactRegion () { return bool(exactRegion); }

public static NetLocation fromCloudAndRegion(String cloud, String region, Boolean exactRegion) {
return new NetLocation().setCloud(cloud).setRegion(region).setExactRegion(exactRegion);
}

public static NetLocation fromIp(String ip) {


+ 15
- 0
bubble-server/src/main/java/bubble/model/cloud/RegionalServiceDriver.java View File

@@ -4,6 +4,7 @@
*/
package bubble.model.cloud;

import bubble.cloud.CloudAndRegion;
import bubble.cloud.CloudRegion;
import bubble.cloud.CloudRegionRelative;
import bubble.server.BubbleConfiguration;
@@ -11,6 +12,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.stream.Collectors;

@@ -26,6 +28,15 @@ public interface RegionalServiceDriver {
BubbleFootprint footprint,
double latitude,
double longitude) {
return findClosestRegions(configuration, clouds, footprint, latitude, longitude, null);
}

static List<CloudRegionRelative> findClosestRegions(BubbleConfiguration configuration,
List<CloudService> clouds,
BubbleFootprint footprint,
double latitude,
double longitude,
Collection<CloudAndRegion> exclude) {

final List<CloudRegionRelative> allRegions = new ArrayList<>();
for (CloudService c : clouds) {
@@ -41,6 +52,10 @@ public interface RegionalServiceDriver {
if (footprint != null && !footprint.isAllowedCountry(region.getLocation().getCountry())) {
continue;
}
if (exclude != null && exclude.contains(new CloudAndRegion(c, region))) {
log.info("findClosestRegions: skipping excluded region: "+region);
continue;
}
final CloudRegionRelative r = new CloudRegionRelative(region);
r.setCloud(c.getUuid());
r.setDistance(latitude, longitude);


+ 14
- 2
bubble-server/src/main/java/bubble/notify/NewNodeNotification.java View File

@@ -4,8 +4,11 @@
*/
package bubble.notify;

import bubble.cloud.CloudAndRegion;
import bubble.model.account.AccountContact;
import bubble.model.cloud.BubbleNetwork;
import bubble.model.cloud.BubbleNode;
import bubble.model.cloud.NetLocation;
import com.fasterxml.jackson.annotation.JsonIgnore;
import lombok.Getter;
import lombok.NoArgsConstructor;
@@ -13,6 +16,7 @@ import lombok.Setter;
import lombok.experimental.Accessors;

import javax.persistence.Transient;
import java.util.ArrayList;
import java.util.List;

import static bubble.ApiConstants.newNodeHostname;
@@ -31,8 +35,16 @@ public class NewNodeNotification {
@Getter @Setter @JsonIgnore private BubbleNetwork networkObject;
@Getter @Setter private String domain;
@Getter @Setter private String fqdn;
@Getter @Setter private String cloud;
@Getter @Setter private String region;
@Getter @Setter private NetLocation netLocation;

@Transient @JsonIgnore @Getter @Setter private List<CloudAndRegion> excludeRegions;
public NewNodeNotification excludeRegion (String cloudUuid, String regionInternalName) {
if (excludeRegions == null) excludeRegions = new ArrayList<>();
excludeRegions.add(new CloudAndRegion(cloudUuid, regionInternalName));
return this;
}
public NewNodeNotification excludeCurrentRegion (BubbleNode node) { return excludeRegion(node.getCloud(), node.getRegion()); }

@Getter @Setter private Boolean automated;
public boolean automated () { return automated != null && automated; }



+ 13
- 36
bubble-server/src/main/java/bubble/notify/NotificationHandler_hello_from_sage.java View File

@@ -4,7 +4,6 @@
*/
package bubble.notify;

import bubble.cloud.CloudRegionRelative;
import bubble.dao.bill.AccountPlanDAO;
import bubble.dao.bill.BubblePlanDAO;
import bubble.dao.cloud.BubbleNetworkDAO;
@@ -14,12 +13,12 @@ import bubble.model.bill.AccountPlan;
import bubble.model.bill.BubblePlan;
import bubble.model.cloud.BubbleNetwork;
import bubble.model.cloud.BubbleNode;
import bubble.model.cloud.CloudService;
import bubble.model.cloud.NetLocation;
import bubble.model.cloud.notify.ReceivedNotification;
import bubble.service.boot.StandardSelfNodeService;
import bubble.service.notify.NotificationService;
import bubble.service.cloud.GeoService;
import bubble.service.cloud.StandardNetworkService;
import bubble.service.notify.NotificationService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;

@@ -99,39 +98,17 @@ public class NotificationHandler_hello_from_sage extends ReceivedNotificationHan
log.warn("hello_from_sage: No sage node found: "+selfNodeService.getSageNode());
} else {
final BubbleNetwork network = networkDAO.findByUuid(thisNode.getNetwork());
final List<CloudRegionRelative> closestRegions = geoService.getCloudRegionRelatives(network, thisNode.getIp4());
if (closestRegions.isEmpty()) {
log.warn("hello_from_sage: no regions found");
} else {
// find the closest region that is not our current region
CloudRegionRelative closestNotUs = null;
for (CloudRegionRelative r : closestRegions) {
if (r.getInternalName().equalsIgnoreCase(thisNode.getRegion()) || r.getName().equalsIgnoreCase(thisNode.getRegion())) {
continue;
}
closestNotUs = r;
break;
}
if (closestNotUs == null) {
// there is only one region
closestNotUs = closestRegions.get(0);
}
final CloudService cloud = cloudDAO.findByUuid(closestNotUs.getCloud());
if (cloud == null) {
log.warn("hello_from_sage: cloud not found, cannot request new node: "+closestNotUs.getCloud());
} else {
final NewNodeNotification newNodeRequest = new NewNodeNotification()
.setAccount(network.getAccount())
.setNetwork(network.getUuid())
.setNetworkName(network.getName())
.setDomain(network.getDomain())
.setCloud(cloud.getUuid())
.setRegion(closestNotUs.getInternalName())
.setAutomated(true);
log.info("hello_from_sage: requesting new node : " + json(newNodeRequest));
networkService.backgroundNewNode(newNodeRequest);
}
}
// find the closest region that is not our current region
final NewNodeNotification newNodeRequest = new NewNodeNotification()
.setAccount(network.getAccount())
.setNetwork(network.getUuid())
.setNetworkName(network.getName())
.setDomain(network.getDomain())
.setNetLocation(NetLocation.fromCloudAndRegion(thisNode.getCloud(), thisNode.getRegion(), false))
.excludeCurrentRegion(thisNode)
.setAutomated(true);
log.info("hello_from_sage: requesting new node : " + json(newNodeRequest));
networkService.backgroundNewNode(newNodeRequest);
}
}
}


+ 11
- 7
bubble-server/src/main/java/bubble/resources/cloud/NetworkActionsResource.java View File

@@ -72,7 +72,8 @@ public class NetworkActionsResource {
public Response startNetwork(@Context Request req,
@Context ContainerRequest ctx,
@QueryParam("cloud") String cloud,
@QueryParam("region") String region) {
@QueryParam("region") String region,
@QueryParam("exactRegion") Boolean exactRegion) {

final Account caller = userPrincipal(ctx);
if (!authAccount(caller)) return forbidden();
@@ -88,7 +89,7 @@ public class NetworkActionsResource {
if (!network.getState().canStart()) return invalid("err.network.cannotStartInCurrentState");
authenticatorService.ensureAuthenticated(ctx, ActionTarget.network);

return _startNetwork(network, cloud, region, req);
return _startNetwork(network, cloud, region, exactRegion, req);
}

@GET @Path(EP_STATUS)
@@ -173,13 +174,14 @@ public class NetworkActionsResource {
public Response restoreNetwork(@Context Request req,
@Context ContainerRequest ctx,
@QueryParam("cloud") String cloud,
@QueryParam("region") String region) {
@QueryParam("region") String region,
@QueryParam("exactRegion") Boolean exactRegion) {
final Account caller = userPrincipal(ctx);
if (!authAccount(caller)) return forbidden();

authenticatorService.ensureAuthenticated(ctx, ActionTarget.network);

return ok(networkService.restoreNetwork(network, cloud, region, req));
return ok(networkService.restoreNetwork(network, cloud, region, exactRegion, req));
}

@PUT @Path(EP_FORK +"/{fqdn}")
@@ -187,7 +189,8 @@ public class NetworkActionsResource {
@Context ContainerRequest ctx,
@PathParam("fqdn") String fqdn,
@QueryParam("cloud") String cloud,
@QueryParam("region") String region) {
@QueryParam("region") String region,
@QueryParam("exactRegion") Boolean exactRegion) {
final Account caller = userPrincipal(ctx);
if (!caller.admin()) return forbidden();

@@ -221,19 +224,20 @@ public class NetworkActionsResource {

network.setForkHost(network.hostFromFqdn(fqdn));

return _startNetwork(network, cloud, region, req);
return _startNetwork(network, cloud, region, exactRegion, req);
}

public Response _startNetwork(BubbleNetwork network,
String cloud,
String region,
Boolean exactRegion,
Request req) {
if ((region != null && cloud == null) || (region == null && cloud != null)) {
throw invalidEx("err.netlocation.invalid", "must specify both cloud and region, or neither");
}

final NetLocation netLocation = (region != null)
? NetLocation.fromCloudAndRegion(cloud, region)
? NetLocation.fromCloudAndRegion(cloud, region, exactRegion)
: NetLocation.fromIp(getRemoteHost(req));

return ok(networkService.startNetwork(network, netLocation));


+ 47
- 10
bubble-server/src/main/java/bubble/service/cloud/GeoService.java View File

@@ -30,10 +30,7 @@ import org.cobbzilla.wizard.validation.SimpleViolationException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.*;
import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors;

@@ -173,7 +170,7 @@ public class GeoService {
}

@Getter(lazy=true) private final RedisService timezoneRedis = redis.prefixNamespace(getClass().getName()+".timezone");
private Map<String, GeoTimeZone> timezoneCache = new ExpirationMap<>(MEMORY_CACHE_TIME);
private final Map<String, GeoTimeZone> timezoneCache = new ExpirationMap<>(MEMORY_CACHE_TIME);

public GeoTimeZone getTimeZone (final Account account, String ip) {
final AtomicReference<Account> acct = new AtomicReference<>(account);
@@ -211,10 +208,23 @@ public class GeoService {
}

public List<CloudRegionRelative> getCloudRegionRelatives(BubbleNetwork network, String userIp) {
return getCloudRegionRelatives(network, userIp, null);
}

public List<CloudRegionRelative> getCloudRegionRelatives(BubbleNetwork network,
String userIp,
Collection<CloudAndRegion> exclude) {
final GeoLocation geo = locate(network.getAccount(), userIp);
final double latitude = geo.getLatitude();
final double longitude = geo.getLongitude();
return getCloudRegionRelatives(network, latitude, longitude, exclude);

}

public List<CloudRegionRelative> getCloudRegionRelatives(BubbleNetwork network,
double latitude,
double longitude,
Collection<CloudAndRegion> exclude) {
// do we have a footprint?
BubbleFootprint footprint = null;
if (network.hasFootprint()) {
@@ -224,17 +234,23 @@ public class GeoService {

// find all cloud services available to us
final List<CloudService> clouds = cloudDAO.findByAccountAndType(network.getAccount(), CloudServiceType.compute);
final List<CloudRegionRelative> closestRegions = findClosestRegions(configuration, clouds, footprint, latitude, longitude);
final List<CloudRegionRelative> closestRegions = findClosestRegions(configuration, clouds, footprint, latitude, longitude, exclude);
if (closestRegions.isEmpty()) throw invalidEx("err.cloudRegions.required");
return closestRegions;
}

public CloudAndRegion selectCloudAndRegion(BubbleNetwork network, NetLocation netLocation) {
return selectCloudAndRegion(network, netLocation, Collections.emptyList());
}

public CloudAndRegion selectCloudAndRegion(BubbleNetwork network,
NetLocation netLocation,
Collection<CloudAndRegion> exclude) {
final CloudRegion closest;
final String cloudUuid;
if (netLocation.hasIp()) {
// determine closest POP to userIp from cloud compute service
final List<CloudRegionRelative> closestRegions = getCloudRegionRelatives(network, netLocation.getIp());
final List<CloudRegionRelative> closestRegions = getCloudRegionRelatives(network, netLocation.getIp(), exclude);
closest = closestRegions.get(0);
cloudUuid = closest.getCloud();
final CloudService cloud = cloudDAO.findByUuid(cloudUuid);
@@ -248,9 +264,30 @@ public class GeoService {
log.error("selectCloudAndRegion (network="+network.getUuid()+"): netLocation.cloud="+netLocation.getCloud()+" not found under account="+network.getAccount());
throw notFoundEx(netLocation.getCloud());
}
closest = cloud.getComputeDriver(configuration).getRegion(netLocation.getRegion());
if (closest == null) throw notFoundEx(netLocation.getRegion());
return new CloudAndRegion(cloud, closest);
final CloudRegion desiredRegion = cloud.getComputeDriver(configuration).getRegion(netLocation.getRegion());
if (desiredRegion == null) throw notFoundEx(netLocation.getRegion());

final GeoLocation desiredLocation = desiredRegion.getLocation();
final List<CloudRegionRelative> candidateRegions = getCloudRegionRelatives(network, desiredLocation.getLatitude(), desiredLocation.getLongitude(), exclude);

if (candidateRegions.isEmpty()) return die("selectCloudAndRegion: no candidate regions found, desiredRegion="+desiredRegion);

// If multiple candidate regions have distance zero, choose the one that is in the same cloud
final CloudRegionRelative preciseRegion = candidateRegions.stream()
.filter(r -> r.getDistance() == 0)
.filter(r -> r.getCloud().equals(netLocation.getCloud()))
.filter(r -> !netLocation.exactRegion() || r.getInternalName().equals(netLocation.getRegion()))
.findFirst()
.orElse(null);

if (preciseRegion != null) return new CloudAndRegion(cloud, preciseRegion);
if (netLocation.exactRegion()) {
log.error("selectCloudAndRegion: exactRegion was set, and region was not a candidate: "+netLocation.getRegion());
throw notFoundEx(netLocation.getRegion());
}

// No precise region available, use the closest
return new CloudAndRegion(cloud, candidateRegions.get(0));

} else {
return die("selectCloudAndRegion: no IP or region provided to launch first node");


+ 17
- 1
bubble-server/src/main/java/bubble/service/cloud/NodeLaunchException.java View File

@@ -6,10 +6,14 @@ package bubble.service.cloud;

import bubble.model.cloud.BubbleNode;
import lombok.Getter;
import lombok.extern.slf4j.Slf4j;

import static org.cobbzilla.util.daemon.ZillaRuntime.shortError;

@Slf4j
public class NodeLaunchException extends RuntimeException {

public enum NodeLaunchExceptionType { fatal, canRetry, interrupted }
public enum NodeLaunchExceptionType { fatal, canRetry, interrupted, unavailableRegion }

@Getter private final BubbleNode node;
public boolean hasNode () { return node != null; }
@@ -21,6 +25,11 @@ public class NodeLaunchException extends RuntimeException {
super(message, e);
this.node = node;
this.type = type;
if (log.isTraceEnabled()) log.trace("NodeLaunchException created: "+this);
}

@Override public String toString() {
return "{NodeLaunchException type="+type+", message="+getMessage()+", cause="+(getCause() == null ? "null" : shortError(getCause()))+"}";
}

private NodeLaunchException (BubbleNode node, Exception e, NodeLaunchExceptionType type) {
@@ -64,4 +73,11 @@ public class NodeLaunchException extends RuntimeException {
public static <T> T launchInterrupted (BubbleNode node, Exception e) { throw new NodeLaunchException(node, e, NodeLaunchExceptionType.interrupted); }
public static <T> T launchInterrupted (BubbleNode node, Exception e, String message) { throw new NodeLaunchException(node, e, message, NodeLaunchExceptionType.interrupted); }

public static <T> T unavailableRegion (String message) { throw new NodeLaunchException(message, NodeLaunchExceptionType.unavailableRegion); }
public static <T> T unavailableRegion (Exception e, String message) { throw new NodeLaunchException(e, message, NodeLaunchExceptionType.unavailableRegion); }
public static <T> T unavailableRegion (Exception e) { throw new NodeLaunchException(e, NodeLaunchExceptionType.unavailableRegion); }
public static <T> T unavailableRegion (BubbleNode node, String message) { throw new NodeLaunchException(node, message, NodeLaunchExceptionType.unavailableRegion); }
public static <T> T unavailableRegion (BubbleNode node, Exception e) { throw new NodeLaunchException(node, e, NodeLaunchExceptionType.unavailableRegion); }
public static <T> T unavailableRegion (BubbleNode node, Exception e, String message) { throw new NodeLaunchException(node, e, message, NodeLaunchExceptionType.unavailableRegion); }

}

+ 10
- 6
bubble-server/src/main/java/bubble/service/cloud/NodeLaunchMonitor.java View File

@@ -31,6 +31,7 @@ import static bubble.service.cloud.NodeProgressMeter.getProgressMeterPrefix;
import static java.util.concurrent.TimeUnit.*;
import static org.cobbzilla.util.daemon.ZillaRuntime.*;
import static org.cobbzilla.util.json.JsonUtil.json;
import static org.cobbzilla.util.reflect.ReflectionUtil.closeQuietly;

@Service @Slf4j
public class NodeLaunchMonitor extends SimpleDaemon {
@@ -54,7 +55,7 @@ public class NodeLaunchMonitor extends SimpleDaemon {
startIfNotRunning();
final LauncherEntry previousLaunch = launcherThreads.get(networkUuid);
if (previousLaunch != null && previousLaunch.isAlive()) {
log.warn("registerLauncher("+networkUuid+"): entry thread exists, stopping it: "+previousLaunch);
log.warn("register("+networkUuid+"): entry thread exists, stopping it: "+previousLaunch);
forceEndLauncher(previousLaunch);
}
launcherThreads.put(networkUuid, new LauncherEntry(nnUuid, networkUuid, t));
@@ -71,8 +72,7 @@ public class NodeLaunchMonitor extends SimpleDaemon {
}

public boolean isRegistered(String networkUuid) {
final LauncherEntry launcherEntry = launcherThreads.get(networkUuid);
return launcherEntry != null && launcherEntry.isAlive();
return launcherThreads.containsKey(networkUuid);
}

private synchronized void startIfNotRunning() {
@@ -102,8 +102,12 @@ public class NodeLaunchMonitor extends SimpleDaemon {
}

public void unregister(String networkUuid) {
log.info("unregister: removing network="+networkUuid);
launcherThreads.remove(networkUuid);
log.info("unregister: removing network="+networkUuid+" from "+stacktrace());
final LauncherEntry entry = launcherThreads.remove(networkUuid);
if (entry != null) {
final NodeProgressMeter progressMeter = progressMeters.get(entry.nnUuid);
if (progressMeter != null) closeQuietly(progressMeter);
}
}

@Override protected void process() {
@@ -119,8 +123,8 @@ public class NodeLaunchMonitor extends SimpleDaemon {
private void forceEndLauncher(LauncherEntry entry) {
final NodeProgressMeter meter = getProgressMeter(entry.getNnUuid());
if (meter != null) meter.cancel();
unregister(entry.getNetworkUuid());
terminate(entry.getThread(), LAUNCH_TERMINATE_TIMEOUT);
launcherThreads.remove(entry.getNetworkUuid());
}

private final Map<String, NodeProgressMeter> progressMeters = new ExpirationMap<>(50, HOURS.toMillis(1), ExpirationEvictionPolicy.atime);


+ 19
- 2
bubble-server/src/main/java/bubble/service/cloud/NodeLauncher.java View File

@@ -69,16 +69,33 @@ public class NodeLauncher implements Runnable {
final NodeLaunchException launchException = (NodeLaunchException) exception;
switch (launchException.getType()) {
case fatal:
die("NodeLauncher.run: fatal launch exception: " + shortError(launchException));
die("NodeLauncher.run: fatal launch exception: " + shortError(launchException), launchException);
break;

case interrupted:
log.warn("NodeLauncher.run: launch interrupted, exiting early");
return;

case canRetry:
log.warn("NodeLauncher.run: nonfatal launch exception for node " + launchException.nodeSummary() + " : " + shortError(launchException));
break;

case unavailableRegion:
log.warn("NodeLauncher.run: unavailableRegion for node " + launchException.nodeSummary() + " : " + shortError(launchException));
if (newNodeRequest.getNetLocation().exactRegion()) {
die("NodeLauncher.run: unavailableRegion and exactRegion set, cannot launch");
} else {
if (launchException.getNode() != null) {
log.warn("NodeLauncher.run: unavailableRegion and exactRegion not set, trying another region");
newNodeRequest.excludeCurrentRegion(launchException.getNode());
} else {
die("NodeLauncher.run: unavailableRegion but node was null!");
}
}
break;

default:
die("NodeLauncher.run: unknown launch exception (type="+launchException.getType()+"): "+shortError(launchException));
die("NodeLauncher.run: unknown launch exception (type="+launchException.getType()+"): "+shortError(launchException), launchException);
}
} else {
die("NodeLauncher.run: fatal launch exception: " + shortError(exception), exception);


+ 18
- 10
bubble-server/src/main/java/bubble/service/cloud/NodeProgressMeter.java View File

@@ -114,9 +114,7 @@ public class NodeProgressMeter extends PipedOutputStream implements Runnable {
log.warn("error("+line+") ignored, error already set");
return;
}
closed.set(true);
error.set(true);
background(this::close);
setCurrentTick(newTick(getErrorMessageKey(line), line));
}

@@ -128,11 +126,19 @@ public class NodeProgressMeter extends PipedOutputStream implements Runnable {
.setDetails(line);
}

public void reset() {
public void resetToPreAnsible() {
reset(standardTicks.size());
_setCurrentTick(lastStandardTick);
}

public void fullReset() {
reset(0);
}

private void reset(int tickPos) {
if (closed.get()) die("reset: progress meter is closed, cannot reset");
error.set(false);
tickPos = standardTicks.size();
_setCurrentTick(lastStandardTick);
this.tickPos = tickPos;
}

@Override public void run() {
@@ -167,10 +173,10 @@ public class NodeProgressMeter extends PipedOutputStream implements Runnable {
public void _setCurrentTick(NodeProgressMeterTick tick, boolean allowForce) {
final String json = json(tick, COMPACT_MAPPER);
if (!allowForce && closed.get()) {
log.warn("setCurrentTick (closed, not setting): "+json);
log.warn("_setCurrentTick (closed, not setting): "+json);
return;
}
log.info("setCurrentTick: "+json);
if (log.isTraceEnabled()) log.trace("_setCurrentTick: "+json+" from: "+stacktrace());
redis.set(getProgressMeterKey(key, nn.getAccount()), json, EX, TICK_REDIS_EXPIRATION);
}

@@ -180,6 +186,7 @@ public class NodeProgressMeter extends PipedOutputStream implements Runnable {
public static final long THREAD_KILL_TIMEOUT = SECONDS.toMillis(5);

@Override public void close() {
if (log.isTraceEnabled()) log.trace("close: called from: "+stacktrace());
closed.set(true);
try {
super.close();
@@ -192,13 +199,14 @@ public class NodeProgressMeter extends PipedOutputStream implements Runnable {
}

public void completed() {
if (log.isTraceEnabled()) log.trace("completed: called from: "+stacktrace());
closed.set(true);
success.set(true);
touch();
_setCurrentTick(new NodeProgressMeterTick()
.setNetwork(nn.getNetwork())
.setAccount(nn.getAccount())
.setMessageKey(METER_COMPLETED)
.setMessageKey(METER_COMPLETED_OK)
.setPercent(100));
background(this::close);
}
@@ -208,13 +216,13 @@ public class NodeProgressMeter extends PipedOutputStream implements Runnable {
}

public void cancel () {
log.info("cancel: cancelling progress meter for network: "+nn.getNetworkName());
if (log.isTraceEnabled()) log.trace("cancel: cancelling progress meter for network: "+nn.getNetworkName()+" from "+stacktrace());
closed.set(true);
success.set(true);
_setCurrentTick(new NodeProgressMeterTick()
.setNetwork(nn.getNetwork())
.setAccount(nn.getAccount())
.setMessageKey(METER_CANCELED)
.setMessageKey(METER_ERROR_CANCELED)
.setPercent(0));
background(this::close);
}


+ 11
- 6
bubble-server/src/main/java/bubble/service/cloud/NodeProgressMeterConstants.java View File

@@ -5,6 +5,7 @@
package bubble.service.cloud;

import bubble.notify.NewNodeNotification;
import lombok.extern.slf4j.Slf4j;
import org.cobbzilla.util.collection.MapBuilder;

import java.lang.reflect.Field;
@@ -22,6 +23,7 @@ import static org.cobbzilla.util.json.JsonUtil.json;
import static org.cobbzilla.util.reflect.ReflectionUtil.constValue;
import static org.cobbzilla.util.reflect.ReflectionUtil.isStaticFinalString;

@Slf4j
public class NodeProgressMeterConstants {

public static final String TICK_PREFIX = "METER_TICK_";
@@ -63,14 +65,15 @@ public class NodeProgressMeterConstants {
public static final String METER_ERROR_PEER_LIMIT_REACHED = "BUBBLE-ERROR: PEER LIMIT REACHED";
public static final String METER_ERROR_NODE_CLOUD_NOT_FOUND = "BUBBLE-ERROR: NODE CLOUD NOT FOUND";
public static final String METER_ERROR_STARTING_NODE = "BUBBLE-ERROR: ERROR STARTING NODE";
public static final String METER_ERROR_UNAVAILABLE_LOCATION = "BUBBLE-ERROR: UNAVAILABLE LOCATION";
public static final String METER_ERROR_RETRY_UNAVAILABLE_LOCATION = "BUBBLE-ERROR: UNAVAILABLE LOCATION (WILL RETRY)";
public static final String METER_ERROR_DNS = "BUBBLE-ERROR: ERROR SETTING DNS ENTRIES FOR NODE";
public static final String METER_ERROR_NO_IP = "BUBBLE-ERROR: NODE STARTED BUT HAS NO IP ADDRESS";
public static final String METER_ERROR_ROLE_VALIDATION_ERRORS = "BUBBLE-ERROR: ROLE VALIDATION FAILED";

public static final String METER_COMPLETED = "meter_completed";
public static final String METER_CANCELED = "meter_canceled";
public static final String METER_START_OR_DNS_ERROR = "meter_start_or_dns_error";
public static final String METER_UNKNOWN_ERROR = "meter_unknown_error";
public static final String METER_ERROR_START_OR_DNS = "meter_error_start_or_dns";
public static final String METER_ERROR_CANCELED = "meter_error_canceled";
public static final String METER_ERROR_UNKNOWN = "meter_error_unknown";
public static final String METER_COMPLETED_OK = "meter_completed_ok";

private static final Map<String, Integer> STANDARD_TICKS = MapBuilder.build(new Object[][] {
{METER_TICK_CONFIRMING_NETWORK_LOCK, 1},
@@ -134,7 +137,9 @@ public class NodeProgressMeterConstants {

public static String getErrorMessageKey (String error) {
final String messageKey = ERRORS.get(error);
return messageKey != null ? messageKey : METER_UNKNOWN_ERROR;
if (messageKey != null) return messageKey;
if (log.isWarnEnabled()) log.warn("getErrorMessageKey("+error+"): no found, returning METER_ERROR_UNKNOWN ("+METER_ERROR_UNKNOWN+")");
return METER_ERROR_UNKNOWN;
}




+ 137
- 73
bubble-server/src/main/java/bubble/service/cloud/StandardNetworkService.java View File

@@ -6,6 +6,7 @@ package bubble.service.cloud;

import bubble.cloud.CloudAndRegion;
import bubble.cloud.compute.ComputeServiceDriver;
import bubble.cloud.compute.UnavailableComputeLocationException;
import bubble.dao.account.AccountDAO;
import bubble.dao.account.AccountPolicyDAO;
import bubble.dao.account.AccountSshKeyDAO;
@@ -63,6 +64,7 @@ import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutorService;
@@ -87,7 +89,6 @@ import static org.cobbzilla.util.daemon.ZillaRuntime.*;
import static org.cobbzilla.util.io.FileUtil.*;
import static org.cobbzilla.util.io.StreamUtil.stream2string;
import static org.cobbzilla.util.json.JsonUtil.json;
import static org.cobbzilla.util.reflect.ReflectionUtil.closeQuietly;
import static org.cobbzilla.util.system.CommandShell.chmod;
import static org.cobbzilla.util.system.Sleep.sleep;
import static org.cobbzilla.util.time.TimeUtil.formatDuration;
@@ -152,6 +153,7 @@ public class StandardNetworkService implements NetworkService {
boolean killNode = false;
try {
progressMeter = launchMonitor.getProgressMeter(nn);
progressMeter.fullReset();
progressMeter.write(METER_TICK_CONFIRMING_NETWORK_LOCK);

if (!confirmNetLock(nn.getNetwork(), lock)) {
@@ -161,7 +163,7 @@ public class StandardNetworkService implements NetworkService {

progressMeter.write(METER_TICK_VALIDATING_NODE_NETWORK_AND_PLAN);
if (network.getState() != BubbleNetworkState.starting) {
if (network.getState() == BubbleNetworkState.stopped) {
if (network.getState().isStopped()) {
log.info("newNode: network was stopped, starting: "+network.getUuid());
networkDAO.update(network.setState(BubbleNetworkState.starting));
} else {
@@ -200,7 +202,8 @@ public class StandardNetworkService implements NetworkService {
return fatalLaunchFailure("newNode: peer limit reached ("+plan.getNodesIncluded()+")");
}

final CloudService cloud = findServiceOrDelegate(nn.getCloud());
final CloudAndRegion cloudAndRegion = geoService.selectCloudAndRegion(network, nn.getNetLocation(), nn.getExcludeRegions());
final CloudService cloud = findServiceOrDelegate(cloudAndRegion.getCloud().getUuid());
computeDriver = cloud.getComputeDriver(configuration);

final CloudService nodeCloud = cloudDAO.findByAccountAndName(network.getAccount(), cloud.getName());
@@ -222,7 +225,7 @@ public class StandardNetworkService implements NetworkService {
.setSizeType(network.getComputeSizeType())
.setSize(computeDriver.getSize(network.getComputeSizeType()).getInternalName())
.setCloud(nodeCloud.getUuid())
.setRegion(nn.getRegion()));
.setRegion(cloudAndRegion.getRegion().getInternalName()));

// if we are forking, we will be our own sage
final BubbleNode sageNode;
@@ -234,9 +237,6 @@ public class StandardNetworkService implements NetworkService {
sageNode = thisNode;
}

@Cleanup("delete") final TempDir automation = new TempDir();
final File bubbleFilesDir = mkdirOrDie(new File(abs(automation) + "/roles/bubble/files"));

// build automation directory for this run
final ValidationResult errors = new ValidationResult();

@@ -259,54 +259,34 @@ public class StandardNetworkService implements NetworkService {
try {
node.waitForIpAddresses();
} catch (TimeoutException e) {
launchFailureCanRetry(node, e, "waitForIpAddresses error");
}
progressMeter.write(METER_TICK_PREPARING_ROLES);
final Map<String, Object> ctx = ansiblePrep.prepAnsible(
automation, bubbleFilesDir, account, network, node, computeDriver,
errors, nn.fork(), nn.getRestoreKey());
if (errors.isInvalid()) {
progressMeter.error(METER_ERROR_ROLE_VALIDATION_ERRORS);
fatalLaunchFailure(node, new MultiViolationException(errors.getViolationBeans()));
final AwaitResult<Object> awaitResult = awaitAll(jobFutures, SECONDS.toMillis(5));
log.info("newNode: timeout waiting for IP addresses, awaitResult="+awaitResult);
if (awaitResult.allSucceeded()) {
// should not happen
return launchFailureCanRetry(node, e, "waitForIpAddresses timeout");
} else {
final Collection<Exception> failures = awaitResult.getFailures().values();
if (empty(failures)) {
return processLaunchException(node, "waitForIpAddresses");
} else {
for (Exception failure : failures) {
if (log.isDebugEnabled()) log.debug("newNode: failure=" + shortError(failure));
if (failure instanceof NodeJobException) {
final NodeJobException jobException = (NodeJobException) failure;
if (jobException.isComputeLocationUnavailable()) {
return unavailableRegion(node, failure, jobException.getMessage());
}
}
}
final Exception firstFailure = failures.iterator().next();
return launchFailureCanRetry(node, firstFailure, "waitForIpAddress timeout: " + shortError(firstFailure));
}
}
}

progressMeter.write(METER_TICK_PREPARING_INSTALL);
node.setState(BubbleNodeState.preparing_install);
nodeDAO.update(node);

// This node is on our network, or is the very first server. We must run ansible on it ourselves.
// write playbook file
writeFile(automation, ctx, PLAYBOOK_YML, PLAYBOOK_TEMPLATE);

// write inventory file
final File inventory = new File(automation, "hosts");
@Cleanup("delete") final TempDir automation = new TempDir();
final File sshKeyFile = packerService.getPackerPrivateKey();
toFile(inventory, "[bubble]\n127.0.0.1"
+ " ansible_python_interpreter=/usr/bin/python3\n");

// write SSH key, if present
if (network.hasSshKey()) {
final File sshPubKeyFile = new File(bubbleFilesDir, "admin_ssh_key.pub");
final AccountSshKey sshKey = sshKeyDAO.findByAccountAndId(network.getAccount(), network.getSshKey());
if (sshKey == null) throw invalidEx("err.sshPublicKey.notFound");
// add a newline before in case authorized_keys file does not end in a new line
// add a newline after so keys appended later will be OK
toFile(sshPubKeyFile, "\n"+sshKey.getSshPublicKey()+"\n");
}
copyScripts(bubbleFilesDir);

// write self_node.json file
writeFile(bubbleFilesDir, null, SELF_NODE_JSON, json(node
.setPlan(plan)
.setRestoreKey(nn.getRestoreKey())));

// write sage_node.json file
writeFile(bubbleFilesDir, null, SAGE_NODE_JSON, json(BubbleNode.sageMask(sageNode)));
writeFile(bubbleFilesDir, null, SAGE_KEY_JSON, json(BubbleNodeKey.sageMask(sageKey)));

// write install_local.sh script
final File installLocalScript = writeFile(automation, ctx, INSTALL_LOCAL_SH, INSTALL_LOCAL_TEMPLATE);
chmod(installLocalScript, "500");
prepareLaunchFiles(nn, computeDriver, node, progressMeter, network, sageKey, account, plan, sageNode, automation, errors, sshKeyFile);

// run ansible
final String sshArgs = "-o UserKnownHostsFile=/dev/null "
@@ -322,14 +302,19 @@ public class StandardNetworkService implements NetworkService {
log.info("newNode: awaiting background jobs...");
final AwaitResult<Object> awaitResult = awaitAll(jobFutures, NODE_START_JOB_TIMEOUT, NODE_START_JOB_AWAIT_SLEEP, new NodeLaunchAwait(progressMeter));
if (!awaitResult.allSucceeded()) {
log.warn("newNode: some background jobs failed: "+awaitResult.getFailures().values());
final Exception firstException = awaitResult.getFailures().values().iterator().next();
if (firstException instanceof NodeJobException) {
progressMeter.error(((NodeJobException) firstException).getMeterError());
log.warn("newNode: some background jobs failed, result="+ awaitResult);
final Collection<Exception> exceptions = awaitResult.getFailures().values();
if (exceptions.isEmpty()) {
return processLaunchException(node, "awaitingBeforeAnsible");
} else {
progressMeter.error(METER_START_OR_DNS_ERROR);
final Exception firstException = exceptions.iterator().next();
if (firstException instanceof NodeJobException) {
progressMeter.error(((NodeJobException) firstException).getMeterError());
} else {
progressMeter.error(METER_ERROR_START_OR_DNS);
}
return launchFailureCanRetry(node, "newNode: error in setup:" + exceptions);
}
return launchFailureCanRetry(node, "newNode: error in setup:" + awaitResult.getFailures().values());
}
waitForDebugger(script);
long configStart = now();
@@ -371,6 +356,7 @@ public class StandardNetworkService implements NetworkService {
while (now() - readyStart < NODE_READY_TIMEOUT) {
sleep(SECONDS.toMillis(2), "newNode: waiting for node (" + node.id() + ") to be ready");
log.debug("newNode: waiting for node (" + node.id() + ") to be ready via "+readyUri);
progressMeter.resetToPreAnsible();
progressMeter.write("READY-CHECK-"+(i++)+" /auth/ready for node: "+node.id());
launchMonitor.touch(network.getUuid());
try {
@@ -412,14 +398,26 @@ public class StandardNetworkService implements NetworkService {
log.info("newNode: ready in "+formatDuration(now() - start));

} catch (Exception e) {
killNode = node != null;
if (e instanceof SleepInterruptedException) {
log.warn("newNode: interrupted!");
} else {
log.error("newNode: " + e, e);
return launchInterrupted(node, e, "newNode: interrupted: "+shortError(e));

} else if (e instanceof NodeLaunchException) {
log.warn("newNode: NodeLaunchException: "+shortError(e));
throw (NodeLaunchException) e;

} else if (e instanceof UnavailableComputeLocationException) {
log.warn("newNode: UnavailableComputeLocationException: "+shortError(e));
if (progressMeter != null) {
if (nn.getNetLocation().exactRegion()) {
progressMeter.error(METER_ERROR_UNAVAILABLE_LOCATION);
} else {
progressMeter.error(METER_ERROR_RETRY_UNAVAILABLE_LOCATION);
}
}
return unavailableRegion(node, e, "newNode: unavailable location: "+shortError(e));
}
killNode = node != null;
if (e instanceof NodeLaunchException) throw (NodeLaunchException) e;
if (e instanceof SleepInterruptedException) launchInterrupted("newNode: interrupted: "+shortError(e));
return die("newNode: "+e, e);

} finally {
@@ -435,8 +433,12 @@ public class StandardNetworkService implements NetworkService {
if (node != null && (killNode || !node.isRunning())) {
node.setState(BubbleNodeState.unknown_error);
nodeDAO.update(node);
if (!progressMeter.hasError()) progressMeter.error(METER_UNKNOWN_ERROR);
if (!progressMeter.hasError()) progressMeter.error(METER_ERROR_UNKNOWN);
killNode(node, "error: node not running: "+node.id()+": "+node.getState());
if (!anyNodesActive(network)) {
log.warn("newNode: no nodes active, setting network state to 'stopped'");
networkDAO.update(network.setState(BubbleNetworkState.error_stopping));
}
}

if (progressMeter != null) {
@@ -449,13 +451,74 @@ public class StandardNetworkService implements NetworkService {
}
}
}
closeQuietly(progressMeter);
}
unlockNetwork(nn.getNetwork(), lock);
}
return node;
}

@NonNull private BubbleNode processLaunchException(BubbleNode node, String prefix) {
if (node.hasLaunchException()) {
final RuntimeException launchException = node.getLaunchException();
if (launchException instanceof UnavailableComputeLocationException) {
log.info("newNode: "+prefix+" received UnavailableComputeLocationException, throwing");
return unavailableRegion(node, launchException, shortError(launchException));
} else {
return launchFailureCanRetry(node, launchException, prefix+" launchException: "+shortError(launchException));
}
} else {
return launchFailureCanRetry(node, prefix+": waitForIpAddress timeout (no failures, no launch exception??)");
}
}

private void prepareLaunchFiles(@NonNull NewNodeNotification nn, ComputeServiceDriver computeDriver, BubbleNode node, NodeProgressMeter progressMeter, BubbleNetwork network, BubbleNodeKey sageKey, Account account, BubblePlan plan, BubbleNode sageNode, TempDir automation, ValidationResult errors, File sshKeyFile) throws IOException {
final File bubbleFilesDir = mkdirOrDie(new File(abs(automation) + "/roles/bubble/files"));
progressMeter.write(METER_TICK_PREPARING_ROLES);
final Map<String, Object> ctx = ansiblePrep.prepAnsible(
automation, bubbleFilesDir, account, network, node, computeDriver,
errors, nn.fork(), nn.getRestoreKey());
if (errors.isInvalid()) {
progressMeter.error(METER_ERROR_ROLE_VALIDATION_ERRORS);
fatalLaunchFailure(node, new MultiViolationException(errors.getViolationBeans()));
}

progressMeter.write(METER_TICK_PREPARING_INSTALL);
node.setState(BubbleNodeState.preparing_install);
nodeDAO.update(node);

// This node is on our network, or is the very first server. We must run ansible on it ourselves.
// write playbook file
writeFile(automation, ctx, PLAYBOOK_YML, PLAYBOOK_TEMPLATE);

// write inventory file
final File inventory = new File(automation, "hosts");
toFile(inventory, "[bubble]\n127.0.0.1"
+ " ansible_python_interpreter=/usr/bin/python3\n");

// write SSH key, if present
if (network.hasSshKey()) {
final File sshPubKeyFile = new File(bubbleFilesDir, "admin_ssh_key.pub");
final AccountSshKey sshKey = sshKeyDAO.findByAccountAndId(network.getAccount(), network.getSshKey());
if (sshKey == null) throw invalidEx("err.sshPublicKey.notFound");
// add a newline before in case authorized_keys file does not end in a new line
// add a newline after so keys appended later will be OK
toFile(sshPubKeyFile, "\n"+sshKey.getSshPublicKey()+"\n");
}
copyScripts(bubbleFilesDir);

// write self_node.json file
writeFile(bubbleFilesDir, null, SELF_NODE_JSON, json(node
.setPlan(plan)
.setRestoreKey(nn.getRestoreKey())));

// write sage_node.json file
writeFile(bubbleFilesDir, null, SAGE_NODE_JSON, json(BubbleNode.sageMask(sageNode)));
writeFile(bubbleFilesDir, null, SAGE_KEY_JSON, json(BubbleNodeKey.sageMask(sageKey)));

// write install_local.sh script
final File installLocalScript = writeFile(automation, ctx, INSTALL_LOCAL_SH, INSTALL_LOCAL_TEMPLATE);
chmod(installLocalScript, "500");
}

public void waitForDebugger(String script) {
if (Boolean.parseBoolean(configuration.getEnvironment().get(ENV_DEBUG_NODE_INSTALL))) {
final String msg = "waitForDebugger: debugging installation, waiting for " + abs(DEBUG_NODE_INSTALL_FILE) + " to exist";
@@ -639,12 +702,10 @@ public class StandardNetworkService implements NetworkService {
final CloudService dns = cloudDAO.findByUuid(domain.getPublicDns());
dns.getDnsDriver(configuration).setNetwork(network);

final CloudAndRegion cloudAndRegion = geoService.selectCloudAndRegion(network, netLocation);
final NewNodeNotification newNodeRequest = new NewNodeNotification()
.setFork(network.fork())
.setNodeHost(network)
.setCloud(cloudAndRegion.getCloud().getUuid())
.setRegion(cloudAndRegion.getRegion().getInternalName())
.setNetLocation(netLocation)
.setLock(lock);

// notify user that new bubble is launching
@@ -675,10 +736,14 @@ public class StandardNetworkService implements NetworkService {

public boolean noNodesActive(BubbleNetwork network) { return !anyNodesActive(network); }

public NewNodeNotification restoreNetwork(BubbleNetwork network, String cloud, String region, Request req) {
public NewNodeNotification restoreNetwork(BubbleNetwork network,
String cloud,
String region,
Boolean exactRegion,
Request req) {

final NetLocation netLocation = (region != null)
? NetLocation.fromCloudAndRegion(cloud, region)
? NetLocation.fromCloudAndRegion(cloud, region, exactRegion)
: NetLocation.fromIp(getRemoteHost(req));

String lock = null;
@@ -707,8 +772,7 @@ public class StandardNetworkService implements NetworkService {
final NewNodeNotification newNodeRequest = new NewNodeNotification()
.setNodeHost(network)
.setRestoreKey(restoreKey)
.setCloud(cloudAndRegion.getCloud().getUuid())
.setRegion(cloudAndRegion.getRegion().getInternalName())
.setNetLocation(netLocation)
.setLock(lock);

// start background process to create node


+ 4
- 1
bubble-server/src/main/java/bubble/service/cloud/job/NodeJobException.java View File

@@ -4,11 +4,12 @@
*/
package bubble.service.cloud.job;

import bubble.cloud.compute.UnavailableComputeLocationException;
import lombok.Getter;

public class NodeJobException extends RuntimeException {

@Getter private String meterError;
@Getter private final String meterError;

public NodeJobException(String meterError, String message, Exception e) {
super(message, e);
@@ -20,4 +21,6 @@ public class NodeJobException extends RuntimeException {
this.meterError = meterError;
}

public boolean isComputeLocationUnavailable () { return getCause() instanceof UnavailableComputeLocationException; }

}

+ 18
- 11
bubble-server/src/main/java/bubble/service/cloud/job/NodeStartJob.java View File

@@ -5,48 +5,55 @@
package bubble.service.cloud.job;

import bubble.cloud.compute.ComputeServiceDriver;
import bubble.cloud.compute.UnavailableComputeLocationException;
import bubble.model.cloud.BubbleNode;
import lombok.extern.slf4j.Slf4j;

import static bubble.service.cloud.NodeProgressMeterConstants.METER_ERROR_NO_IP;
import static bubble.service.cloud.NodeProgressMeterConstants.METER_ERROR_STARTING_NODE;
import static bubble.service.cloud.NodeProgressMeterConstants.*;
import static org.cobbzilla.util.daemon.ZillaRuntime.shortError;

@Slf4j
public class NodeStartJob implements Runnable {

private BubbleNode node;
// private final BubbleNodeDAO nodeDAO;
private final ComputeServiceDriver computeDriver;

public NodeStartJob(BubbleNode node,
// BubbleNodeDAO nodeDAO,
ComputeServiceDriver computeDriver) {
this.node = node;
// this.nodeDAO = nodeDAO;
this.computeDriver = computeDriver;
}

@Override public void run() {
try {
// node.setState(BubbleNodeState.booting);
// nodeDAO.update(node);

log.debug("run: calling computeDriver.start("+node.id()+")");
node.setLaunchException(null);
node = computeDriver.start(node);
log.debug("run: computeDriver.start("+node.id()+") returned successfully");

// node.setState(BubbleNodeState.booted);
// nodeDAO.update(node);

if (!node.hasIp4()) {
node.setLaunchException(new IllegalStateException("node booted but has no IP"));
throw new NodeJobException(METER_ERROR_NO_IP, "node booted but has no IP");
}

} catch (NodeJobException e) {
log.debug("run: computeDriver.start("+node.id()+") returning with launchError(NodeJobException): "+shortError(e));
node.setLaunchException(e);
throw e;

} catch (UnavailableComputeLocationException e) {
log.debug("run: computeDriver.start("+node.id()+") returning with launchError(UnavailableComputeLocationException): "+shortError(e));
node.setLaunchException(e);
throw new NodeJobException(METER_ERROR_UNAVAILABLE_LOCATION, "node cannot be launched at this location", e);

} catch (RuntimeException e) {
log.debug("run: computeDriver.start("+node.id()+") returning with launchError(RuntimeException): "+shortError(e));
node.setLaunchException(e);
throw new NodeJobException(METER_ERROR_STARTING_NODE, "error starting node: "+shortError(e), e);

} catch (Exception e) {
log.debug("run: computeDriver.start("+node.id()+") returning with launchError(Exception): "+shortError(e));
node.setLaunchException(new IllegalStateException(shortError(e), e));
throw new NodeJobException(METER_ERROR_STARTING_NODE, "error starting node: "+shortError(e), e);
}
}


+ 1
- 0
bubble-server/src/main/resources/logback.xml View File

@@ -55,6 +55,7 @@
<logger name="bubble.service.cloud.job" level="DEBUG" />
<logger name="bubble.service.cloud.NodeLauncher" level="DEBUG" />
<logger name="bubble.service.cloud.NodeService" level="DEBUG" />
<!-- <logger name="bubble.service.cloud.NodeProgressMeter" level="DEBUG" />-->
<logger name="bubble.cloud.compute.vultr" level="DEBUG" />
<logger name="bubble.resources.message" level="INFO" />
<logger name="bubble.app.analytics" level="DEBUG" />


+ 12
- 7
bubble-server/src/main/resources/message_templates/en_US/server/post_auth/ResourceMessages.properties View File

@@ -211,6 +211,9 @@ message_plan_max_accounts_multiple=You can create up to {{max}} user accounts on
message_plan_no_max_accounts=This plan supports an unlimited number of user accounts
field_label_show_advanced_plan_options=Customize Launch Options
field_label_region=Location
field_label_flex_region=Flexible location
field_label_flex_region_description=If your Bubble can't be launched in the selected location, it will be launched in the next closest region.
field_label_exact_region_description=If your Bubble can't be launched in the selected location, it will be not be launched. You can then choose a different location and relaunch it.
field_label_footprint=Footprint
field_label_network_ssh_key=SSH Key
field_description_network_ssh_key=You can SSH into the Bubble with this key
@@ -416,8 +419,7 @@ meter_tick_ready_check2=And what a lovely pie it is...
#meter_tick_ssh_keys=Setting up SSH keys

# Launch progress meter: success marker
meter_canceled=Bubble installation was canceled
meter_completed=Bubble installation completed successfully! On to your Bubble!
meter_completed_ok=Bubble installation completed successfully! On to your Bubble!

# Launch progress meter: errors
meter_error_confirming_network_lock=Error confirming network lock
@@ -427,14 +429,17 @@ meter_error_plan_not_enabled=Account plan is not enabled, cannot launch Bubble
meter_error_node_cloud_not_found=Compute cloud was not found, cannot launch Bubble
meter_error_bubble_jar_not_found=Bubble jar file was not found, cannot launch Bubble
meter_error_roles_not_found=Ansible roles were not found, cannot launch Bubble
meter_error_starting_node=Error starting node
meter_error_dns=Error setting DNS entries for node
meter_error_no_ip=Bubble node started, but does not have an IP address or SSH key, cannot install Bubble
meter_error_starting_node=Error starting Bubble
meter_error_unavailable_location=The target location is currently unavailable
meter_error_retry_unavailable_location=The target location is currently unavailable, trying another nearby location
meter_error_dns=Error setting DNS entries for Bubble
meter_error_no_ip=Bubble started, but does not have an IP address or SSH key, cannot install Bubble
meter_error_role_validation_errors=Validation of ansible roles failed, cannot install Bubble
meter_start_or_dns_error=Error starting node or writing DNS records
meter_error_canceled=Bubble installation was canceled
meter_error_start_or_dns=Error starting Bubble or writing DNS records

# Launch progress meter: catch-all for unknown/unmapped errors
meter_unknown_error=An unknown error occurred
meter_error_unknown=An unknown error occurred

# Help text shown during launch
title_launch_help_html=Next Steps


+ 8
- 8
bubble-server/src/test/java/bubble/mock/MockNetworkService.java View File

@@ -4,6 +4,7 @@
*/
package bubble.mock;

import bubble.cloud.CloudServiceType;
import bubble.cloud.compute.ComputeServiceDriver;
import bubble.cloud.compute.mock.MockComputeDriver;
import bubble.dao.cloud.BubbleNetworkDAO;
@@ -17,8 +18,6 @@ import bubble.service.cloud.StandardNetworkService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import static org.cobbzilla.util.daemon.ZillaRuntime.die;

@Service
public class MockNetworkService extends StandardNetworkService {

@@ -34,12 +33,13 @@ public class MockNetworkService extends StandardNetworkService {
@Override public BubbleNode newNode(NewNodeNotification nn, NodeLaunchMonitor launchMonitor) {

final BubbleNetwork network = networkDAO.findByUuid(nn.getNetwork());
final CloudService cloud = findServiceOrDelegate(nn.getCloud());
final CloudService nodeCloud = cloudDAO.findByAccountAndName(network.getAccount(), cloud.getName());
if (nodeCloud == null) return die("newNode: node cloud not found: "+cloud.getName()+" for account "+network.getAccount());
final CloudService cloud = cloudDAO.findByAccountAndType(network.getAccount(), CloudServiceType.compute)
.stream()
.filter(c -> c.getDriverClass().equals(MockComputeDriver.class.getName()))
.findFirst()
.orElseThrow();

final ComputeServiceDriver computeDriver = cloud.getComputeDriver(configuration);
if (!(computeDriver instanceof MockComputeDriver)) return die("newNode: expected MockComputeDriver");

final BubbleNode node = nodeDAO.create(new BubbleNode()
.setHost(nn.getHost())
@@ -52,8 +52,8 @@ public class MockNetworkService extends StandardNetworkService {
.setAccount(network.getAccount())
.setSizeType(network.getComputeSizeType())
.setSize(computeDriver.getSize(network.getComputeSizeType()).getInternalName())
.setCloud(nodeCloud.getUuid())
.setRegion(nn.getRegion()));
.setCloud(cloud.getUuid())
.setRegion(nn.getNetLocation().getRegion()));

network.setState(BubbleNetworkState.running);
networkDAO.update(network);


+ 1
- 1
bubble-web

@@ -1 +1 @@
Subproject commit 9c991ff6978ad469f9582a72af96ee8b1a22539a
Subproject commit e2fe4f47113b5d1a53efaacc59ca3ec42f5449ae

+ 1
- 1
utils/cobbzilla-utils

@@ -1 +1 @@
Subproject commit 946d62be17b43672695c780026d8a742ca03ce0e
Subproject commit b2f8b5e68710fcae126c733563d9938db6fe5b80

Loading…
Cancel
Save