details: http://www.bx.psu.edu/hg/galaxy/rev/e19eef93584f changeset: 3078:e19eef93584f user: Enis Afgan <afgane@gmail.com> date: Thu Oct 29 17:40:31 2009 -0400 description: Added good bit of exception try/catch blocks in providers' code. Started on addressing zombie UCIs. diffstat: lib/galaxy/cloud/__init__.py | 9 +- lib/galaxy/cloud/providers/ec2.py | 272 +++++++++++++++++++++------------- lib/galaxy/cloud/providers/eucalyptus.py | 283 ++++++++++++++++++++++------------ lib/galaxy/web/controllers/cloud.py | 28 +-- templates/cloud/add_provider.mako | 3 - templates/cloud/configure_cloud.mako | 9 +- templates/cloud/configure_uci.mako | 4 +- templates/cloud/viewInstance.mako | 7 +- 8 files changed, 382 insertions(+), 233 deletions(-) diffs (918 lines): diff -r be631ed97541 -r e19eef93584f lib/galaxy/cloud/__init__.py --- a/lib/galaxy/cloud/__init__.py Thu Oct 29 11:46:43 2009 -0400 +++ b/lib/galaxy/cloud/__init__.py Thu Oct 29 17:40:31 2009 -0400 @@ -34,8 +34,6 @@ CREATING = "creating" ) -JOB_WAIT, JOB_ERROR, JOB_INPUT_ERROR, JOB_INPUT_DELETED, JOB_OK, JOB_READY, JOB_DELETED, JOB_ADMIN_DELETED = 'wait', 'error', 'input_error', 'input_deleted', 'ok', 'ready', 'deleted', 'admin_deleted' - class CloudManager( object ): """ Highest level interface to cloud management. @@ -318,13 +316,16 @@ vol.i_id = instance_id vol.flush() - def set_error( self, error ): + def set_error( self, error, set_state=False ): """ - Sets error field of given UCI in local Galaxy database + Sets error field of given UCI in local Galaxy database. If set_state is set to 'true', + method also sets state of give UCI to 'error' """ uci = model.UCI.get( self.uci_id ) uci.refresh() uci.error = error + if set_state: + uci.state = uci_states.ERROR uci.flush() # --------- Getter methods ----------------- diff -r be631ed97541 -r e19eef93584f lib/galaxy/cloud/providers/ec2.py --- a/lib/galaxy/cloud/providers/ec2.py Thu Oct 29 11:46:43 2009 -0400 +++ b/lib/galaxy/cloud/providers/ec2.py Thu Oct 29 17:40:31 2009 -0400 @@ -12,6 +12,7 @@ import galaxy.eggs galaxy.eggs.require("boto") from boto.ec2.connection import EC2Connection +from boto.ec2.regioninfo import RegionInfo import boto.exception import logging @@ -93,12 +94,23 @@ """ log.debug( '##### Establishing EC2 cloud connection' ) provider = uci_wrapper.get_provider() - region = RegionInfo( None, provider.region_name, provider.region_endpoint ) - conn = EC2Connection( aws_access_key_id=uci_wrapper.get_access_key(), - aws_secret_access_key=uci_wrapper.get_secret_key(), - is_secure=provider.is_secure, - region=region, - path=provider.path ) + try: + region = RegionInfo( None, provider.region_name, provider.region_endpoint ) + except Exception, e: + log.error( "Selecting region with cloud provider failed: %s" % str(e) ) + uci_wrapper.set_error( "Selecting region with cloud provider failed: " + str(e), True ) + return None + try: + conn = EC2Connection( aws_access_key_id=uci_wrapper.get_access_key(), + aws_secret_access_key=uci_wrapper.get_secret_key(), + is_secure=provider.is_secure, + region=region, + path=provider.path ) + except Exception, e: + log.error( "Establishing connection with cloud failed: %s" % str(e) ) + uci_wrapper.set_error( "Establishing connection with cloud failed: " + str(e), True ) + return None + return conn def set_keypair( self, uci_wrapper, conn ): @@ -120,7 +132,7 @@ uci_wrapper.set_key_pair( inst, kp.name, kp.material ) else: log.error( "EC2 response error: '%s'" % e ) - uci_wrapper.set_error( "EC2 response error while creating key pair: " + e ) + uci_wrapper.set_error( "EC2 response error while creating key pair: " + str(e), True ) return kp.name @@ -235,53 +247,65 @@ Starts instance(s) of given UCI on the cloud. """ conn = self.get_connection( uci_wrapper ) -# - self.set_keypair( uci_wrapper, conn ) - i_indexes = uci_wrapper.get_instances_indexes() # Get indexes of i_indexes associated with this UCI whose state is 'None' - log.debug( "Starting instances with IDs: '%s' associated with UCI '%s' " % ( uci_wrapper.get_name(), i_indexes ) ) - for i_index in i_indexes: - mi_id = self.get_mi_id( uci_wrapper.get_type( i_index ) ) - uci_wrapper.set_mi( i_index, mi_id ) + if uci_wrapper.get_state() != uci_states.ERROR: + self.set_keypair( uci_wrapper, conn ) + i_indexes = uci_wrapper.get_instances_indexes() # Get indexes of i_indexes associated with this UCI whose state is 'None' + log.debug( "Starting instances with IDs: '%s' associated with UCI '%s' " % ( uci_wrapper.get_name(), i_indexes ) ) - # Check if galaxy security group exists (and create it if it does not) - security_group = 'galaxyWeb' - log.debug( "Setting up '%s' security group." % security_group ) - sgs = conn.get_all_security_groups() # security groups - gsgt = False # galaxy security group test - for sg in sgs: - if sg.name == security_group: - gsgt = True - # If security group does not exist, create it - if not gsgt: - gSecurityGroup = conn.create_security_group(security_group, 'Security group for Galaxy.') - gSecurityGroup.authorize( 'tcp', 80, 80, '0.0.0.0/0' ) # Open HTTP port - gSecurityGroup.authorize( 'tcp', 22, 22, '0.0.0.0/0' ) # Open SSH port - # Start an instance - log.debug( "***** Starting instance for UCI '%s'" % uci_wrapper.get_name() ) - #TODO: Once multiple volumes can be attached to a single instance, update 'userdata' composition - userdata = uci_wrapper.get_store_volume_id()+"|"+uci_wrapper.get_access_key()+"|"+uci_wrapper.get_secret_key() - log.debug( 'Using following command: conn.run_instances( image_id=%s, key_name=%s, security_groups=[%s], user_data=[OMITTED], instance_type=%s, placement=%s )' - % ( mi_id, uci_wrapper.get_key_pair_name( i_index ), [security_group], uci_wrapper.get_type( i_index ), uci_wrapper.get_uci_availability_zone() ) ) - reservation = conn.run_instances( image_id=mi_id, - key_name=uci_wrapper.get_key_pair_name( i_index ), - security_groups=[security_group], - user_data=userdata, - instance_type=uci_wrapper.get_type( i_index ), - placement=uci_wrapper.get_uci_availability_zone() ) - # Record newly available instance data into local Galaxy database - l_time = datetime.utcnow() - uci_wrapper.set_launch_time( l_time, i_index=i_index ) # format_time( reservation.i_indexes[0].launch_time ) ) - if not uci_wrapper.uci_launch_time_set(): - uci_wrapper.set_uci_launch_time( l_time ) - uci_wrapper.set_reservation_id( i_index, str( reservation ).split(":")[1] ) - # TODO: if more than a single instance will be started through single reservation, change this reference to element [0] - i_id = str( reservation.instances[0]).split(":")[1] - uci_wrapper.set_instance_id( i_index, i_id ) - s = reservation.instances[0].state - uci_wrapper.change_state( s, i_id, s ) - log.debug( "Instance of UCI '%s' started, current state: '%s'" % ( uci_wrapper.get_name(), uci_wrapper.get_state() ) ) - + if uci_wrapper.get_state() != uci_states.ERROR: + for i_index in i_indexes: + mi_id = self.get_mi_id( uci_wrapper.get_type( i_index ) ) + uci_wrapper.set_mi( i_index, mi_id ) + + # Check if galaxy security group exists (and create it if it does not) + security_group = 'galaxyWeb' + log.debug( "Setting up '%s' security group." % security_group ) + sgs = conn.get_all_security_groups() # security groups + gsgt = False # galaxy security group test + for sg in sgs: + if sg.name == security_group: + gsgt = True + # If security group does not exist, create it + if not gsgt: + gSecurityGroup = conn.create_security_group(security_group, 'Security group for Galaxy.') + gSecurityGroup.authorize( 'tcp', 80, 80, '0.0.0.0/0' ) # Open HTTP port + gSecurityGroup.authorize( 'tcp', 22, 22, '0.0.0.0/0' ) # Open SSH port + + if uci_wrapper.get_state() != uci_states.ERROR: + # Start an instance + log.debug( "***** Starting instance for UCI '%s'" % uci_wrapper.get_name() ) + #TODO: Once multiple volumes can be attached to a single instance, update 'userdata' composition + userdata = uci_wrapper.get_store_volume_id()+"|"+uci_wrapper.get_access_key()+"|"+uci_wrapper.get_secret_key() + log.debug( 'Using following command: conn.run_instances( image_id=%s, key_name=%s, security_groups=[%s], user_data=[OMITTED], instance_type=%s, placement=%s )' + % ( mi_id, uci_wrapper.get_key_pair_name( i_index ), [security_group], uci_wrapper.get_type( i_index ), uci_wrapper.get_uci_availability_zone() ) ) + try: + reservation = conn.run_instances( image_id=mi_id, + key_name=uci_wrapper.get_key_pair_name( i_index ), + security_groups=[security_group], + user_data=userdata, + instance_type=uci_wrapper.get_type( i_index ), + placement=uci_wrapper.get_uci_availability_zone() ) + except boto.exception.EC2ResponseError, e: + log.error( "EC2 response error when starting UCI '%s': '%s'" % ( uci_wrapper.get_name(), str(e) ) ) + uci_wrapper.set_error( "EC2 response error when starting: " + str(e), True ) + # Record newly available instance data into local Galaxy database + l_time = datetime.utcnow() + uci_wrapper.set_launch_time( l_time, i_index=i_index ) # format_time( reservation.i_indexes[0].launch_time ) ) + if not uci_wrapper.uci_launch_time_set(): + uci_wrapper.set_uci_launch_time( l_time ) + try: + uci_wrapper.set_reservation_id( i_index, str( reservation ).split(":")[1] ) + # TODO: if more than a single instance will be started through single reservation, change this reference to element [0] + i_id = str( reservation.instances[0]).split(":")[1] + uci_wrapper.set_instance_id( i_index, i_id ) + s = reservation.instances[0].state + uci_wrapper.change_state( s, i_id, s ) + log.debug( "Instance of UCI '%s' started, current state: '%s'" % ( uci_wrapper.get_name(), uci_wrapper.get_state() ) ) + except boto.exception.EC2ResponseError, e: + log.error( "EC2 response error when retrieving instance information for UCI '%s': '%s'" % ( uci_wrapper.get_name(), str(e) ) ) + uci_wrapper.set_error( "EC2 response error when retrieving instance information: " + str(e), True ) + def stopUCI( self, uci_wrapper): """ Stops all of cloud instances associated with given UCI. @@ -355,8 +379,8 @@ def update( self ): """ - Runs a global status update on all instances that are in 'running', 'pending', "creating", or 'shutting-down' state. - Also, runs update on all storage volumes that are in "in-use", "creating", or 'None' state. + Runs a global status update on all instances that are in 'running', 'pending', or 'shutting-down' state. + Also, runs update on all storage volumes that are in 'in-use', 'creating', or 'None' state. Reason behind this method is to sync state of local DB and real-world resources """ log.debug( "Running general status update for EC2 UCIs..." ) @@ -385,14 +409,28 @@ a_key = uci.credentials.access_key s_key = uci.credentials.secret_key # Get connection - region = RegionInfo( None, uci.credentials.provider.region_name, uci.credentials.provider.region_endpoint ) - conn = EC2Connection( aws_access_key_id=a_key, - aws_secret_access_key=s_key, - is_secure=uci.credentials.provider.is_secure, - region=region, - path=uci.credentials.provider.path ) + try: + region = RegionInfo( None, uci.credentials.provider.region_name, uci.credentials.provider.region_endpoint ) + conn = EC2Connection( aws_access_key_id=a_key, + aws_secret_access_key=s_key, + is_secure=uci.credentials.provider.is_secure, + region=region, + path=uci.credentials.provider.path ) + except boto.exception.EC2ResponseError, e: + log.error( "Establishing connection with cloud failed: %s" % str(e) ) + uci.error( "Establishing connection with cloud failed: " + str(e) ) + uci.state( uci_states.ERROR ) + return None + # Get reservations handle for given instance - rl= conn.get_all_instances( [inst.instance_id] ) + try: + rl= conn.get_all_instances( [inst.instance_id] ) + except boto.exception.EC2ResponseError, e: + log.error( "Retrieving instance(s) from cloud for UCI '%s' failed: " % ( uci.name, str(e) ) ) + uci.error( "Retrieving instance(s) from cloud failed: " + str(e) ) + uci.state( uci_states.ERROR ) + return None + # Because EPC deletes references to reservations after a short while after instances have terminated, getting an empty list as a response to a query # typically means the instance has successfully shut down but the check was not performed in short enough amount of time. Until alternative solution # is found, below code sets state of given UCI to 'error' to indicate to the user something out of ordinary happened. @@ -409,27 +447,33 @@ # Update instance status in local DB with info from cloud provider for r in rl: for i, cInst in enumerate( r.instances ): - s = cInst.update() - log.debug( "Checking state of cloud instance '%s' associated with UCI '%s' and reservation '%s'. State='%s'" % ( cInst, uci.name, r, s ) ) - if s != inst.state: - inst.state = s - inst.flush() - # After instance has shut down, ensure UCI is marked as 'available' - if s == instance_states.TERMINATED and uci.state != uci_states.ERROR: - uci.state = uci_states.AVAILABLE - uci.launch_time = None - uci.flush() - # Making sure state of UCI is updated. Once multiple instances become associated with single UCI, this will need to be changed. - if s != uci.state and s != instance_states.TERMINATED: - uci.state = s - uci.flush() - if cInst.public_dns_name != inst.public_dns: - inst.public_dns = cInst.public_dns_name - inst.flush() - if cInst.private_dns_name != inst.private_dns: - inst.private_dns = cInst.private_dns_name - inst.flush() - + try: + s = cInst.update() + log.debug( "Checking state of cloud instance '%s' associated with UCI '%s' and reservation '%s'. State='%s'" % ( cInst, uci.name, r, s ) ) + if s != inst.state: + inst.state = s + inst.flush() + # After instance has shut down, ensure UCI is marked as 'available' + if s == instance_states.TERMINATED and uci.state != uci_states.ERROR: + uci.state = uci_states.AVAILABLE + uci.launch_time = None + uci.flush() + # Making sure state of UCI is updated. Once multiple instances become associated with single UCI, this will need to be changed. + if s != uci.state and s != instance_states.TERMINATED: + uci.state = s + uci.flush() + if cInst.public_dns_name != inst.public_dns: + inst.public_dns = cInst.public_dns_name + inst.flush() + if cInst.private_dns_name != inst.private_dns: + inst.private_dns = cInst.private_dns_name + inst.flush() + except boto.exception.EC2ResponseError, e: + log.error( "Updating status of instance(s) from cloud for UCI '%s' failed: " % ( uci.name, str(e) ) ) + uci.error( "Updating instance status from cloud failed: " + str(e) ) + uci.state( uci_states.ERROR ) + return None + def updateStore( self, store ): # Get credentials associated wit this store uci_id = store.uci_id @@ -438,35 +482,55 @@ a_key = uci.credentials.access_key s_key = uci.credentials.secret_key # Get connection - region = RegionInfo( None, uci.credentials.provider.region_name, uci.credentials.provider.region_endpoint ) - conn = EC2Connection( aws_access_key_id=a_key, + try: + region = RegionInfo( None, uci.credentials.provider.region_name, uci.credentials.provider.region_endpoint ) + conn = EC2Connection( aws_access_key_id=a_key, aws_secret_access_key=s_key, is_secure=uci.credentials.provider.is_secure, region=region, path=uci.credentials.provider.path ) + except boto.exception.EC2ResponseError, e: + log.error( "Establishing connection with cloud failed: %s" % str(e) ) + uci.error( "Establishing connection with cloud failed: " + str(e) ) + uci.state( uci_states.ERROR ) + return None + # Get reservations handle for given store - vl = conn.get_all_volumes( [store.volume_id] ) + try: + vl = conn.get_all_volumes( [store.volume_id] ) # log.debug( "Store '%s' vl: '%s'" % ( store.volume_id, vl ) ) + except boto.exception.EC2ResponseError, e: + log.error( "Retrieving volume(s) from cloud for UCI '%s' failed: " % ( uci.name, str(e) ) ) + uci.error( "Retrieving volume(s) from cloud failed: " + str(e) ) + uci.state( uci_states.ERROR ) + return None + # Update store status in local DB with info from cloud provider - if store.status != vl[0].status: - # In case something failed during creation of UCI but actual storage volume was created and yet - # UCI state remained as 'new', try to remedy this by updating UCI state here - if ( store.status == None ) and ( store.volume_id != None ): - uci.state = vl[0].status - uci.flush() - - store.status = vl[0].status - store.flush() - if store.i_id != vl[0].instance_id: - store.i_id = vl[0].instance_id - store.flush() - if store.attach_time != vl[0].attach_time: - store.attach_time = vl[0].attach_time - store.flush() - if store.device != vl[0].device: - store.device = vl[0].device - store.flush() - + try: + if store.status != vl[0].status: + # In case something failed during creation of UCI but actual storage volume was created and yet + # UCI state remained as 'new', try to remedy this by updating UCI state here + if ( store.status == None ) and ( store.volume_id != None ): + uci.state = vl[0].status + uci.flush() + + store.status = vl[0].status + store.flush() + if store.i_id != vl[0].instance_id: + store.i_id = vl[0].instance_id + store.flush() + if store.attach_time != vl[0].attach_time: + store.attach_time = vl[0].attach_time + store.flush() + if store.device != vl[0].device: + store.device = vl[0].device + store.flush() + except boto.exception.EC2ResponseError, e: + log.error( "Updating status of volume(s) from cloud for UCI '%s' failed: " % ( uci.name, str(e) ) ) + uci.error( "Updating volume status from cloud failed: " + str(e) ) + uci.state( uci_states.ERROR ) + return None + # def updateUCI( self, uci ): # """ # Runs a global status update on all storage volumes and all instances that are diff -r be631ed97541 -r e19eef93584f lib/galaxy/cloud/providers/eucalyptus.py --- a/lib/galaxy/cloud/providers/eucalyptus.py Thu Oct 29 11:46:43 2009 -0400 +++ b/lib/galaxy/cloud/providers/eucalyptus.py Thu Oct 29 17:40:31 2009 -0400 @@ -7,7 +7,7 @@ from galaxy.datatypes.data import nice_size from galaxy.util.bunch import Bunch from Queue import Queue -from sqlalchemy import or_ +from sqlalchemy import or_, and_ import galaxy.eggs galaxy.eggs.require("boto") @@ -69,9 +69,8 @@ """Run the next job, waiting until one is available if necessary""" cnt = 0 while 1: - + #log.debug( '[%d] run_next->queue.qsize(): %s' % ( cnt, self.queue.qsize() ) ) uci_wrapper = self.queue.get() - log.debug( '[%d] uci type: %s' % ( cnt, uci_wrapper.get_name() ) ) uci_state = uci_wrapper.get_state() if uci_state is self.STOP_SIGNAL: return @@ -82,6 +81,7 @@ self.deleteUCI( uci_wrapper ) elif uci_state==uci_states.SUBMITTED: self.startUCI( uci_wrapper ) + #self.dummyStartUCI( uci_wrapper ) elif uci_state==uci_states.SHUTTING_DOWN: self.stopUCI( uci_wrapper ) except: @@ -94,13 +94,25 @@ """ log.debug( '##### Establishing eucalyptus cloud connection' ) provider = uci_wrapper.get_provider() - euca_region = RegionInfo( None, provider.region_name, provider.region_endpoint ) - conn = EC2Connection( aws_access_key_id=uci_wrapper.get_access_key(), - aws_secret_access_key=uci_wrapper.get_secret_key(), - is_secure=provider.is_secure, - port=provider.port, - region=euca_region, - path=provider.path ) + try: + euca_region = RegionInfo( None, provider.region_name, provider.region_endpoint ) + except Exception, e: + log.error( "Selecting region with cloud provider failed: %2" % str(e) ) + uci_wrapper.set_error( "Selecting region with cloud provider failed: " + str(e), True ) + return None + + try: + conn = EC2Connection( aws_access_key_id=uci_wrapper.get_access_key(), + aws_secret_access_key=uci_wrapper.get_secret_key(), + is_secure=provider.is_secure, + port=provider.port, + region=euca_region, + path=provider.path ) + except boto.exception.EC2ResponseError, e: + log.error( "Establishing connection with cloud failed: %s" % str(e) ) + uci_wrapper.set_error( "Establishing connection with cloud failed: " + str(e), True ) + return None + return conn def set_keypair( self, uci_wrapper, conn ): @@ -108,20 +120,22 @@ Generate keypair using user's default credentials """ log.debug( "Getting user's keypair: '%s'" % self.key_pair ) - kp = conn.get_key_pair( self.key_pair ) instances = uci_wrapper.get_instances_indexes() - try: + kp = conn.get_key_pair( self.key_pair ) for inst in instances: - log.debug("inst: '%s'" % inst ) uci_wrapper.set_key_pair( inst, kp.name ) return kp.name - except AttributeError: # No keypair under this name exists so create it - log.info( "No keypair found, creating keypair '%s'" % self.key_pair ) - kp = conn.create_key_pair( self.key_pair ) - for inst in instances: - uci_wrapper.set_key_pair( inst, kp.name, kp.material ) - + except boto.exception.EC2ResponseError, e: # No keypair under this name exists so create it + if e.code == 'InvalidKeyPair.NotFound': + log.info( "No keypair found, creating keypair '%s'" % self.key_pair ) + kp = conn.create_key_pair( self.key_pair ) + for inst in instances: + uci_wrapper.set_key_pair( inst, kp.name, kp.material ) + else: + log.error( "EC2 response error: '%s'" % e ) + uci_wrapper.set_error( "EC2 response error while creating key pair: " + str(e), True ) + return kp.name def get_mi_id( self, type ): @@ -208,10 +222,11 @@ uci = uci_wrapper.get_uci() log.debug( "Would be starting instance '%s'" % uci.name ) - uci_wrapper.change_state( uci_states.PENDING ) -# log.debug( "Sleeping a bit... (%s)" % uci.name ) -# time.sleep(20) -# log.debug( "Woke up! (%s)" % uci.name ) +# uci_wrapper.change_state( uci_states.SUBMITTED_UCI ) +# log.debug( "Set UCI state to SUBMITTED_UCI" ) + log.debug( "Sleeping a bit... (%s)" % uci.name ) + time.sleep(10) + log.debug( "Woke up! (%s)" % uci.name ) def startUCI( self, uci_wrapper ): """ @@ -219,30 +234,43 @@ """ conn = self.get_connection( uci_wrapper ) # - self.set_keypair( uci_wrapper, conn ) + if uci_wrapper.get_state() != uci_states.ERROR: + self.set_keypair( uci_wrapper, conn ) i_indexes = uci_wrapper.get_instances_indexes() # Get indexes of i_indexes associated with this UCI - for i_index in i_indexes: - mi_id = self.get_mi_id( uci_wrapper.get_type( i_index ) ) - log.debug( "mi_id: %s, uci_wrapper.get_key_pair_name( i_index ): %s" % ( mi_id, uci_wrapper.get_key_pair_name( i_index ) ) ) - uci_wrapper.set_mi( i_index, mi_id ) + if uci_wrapper.get_state() != uci_states.ERROR: + for i_index in i_indexes: + mi_id = self.get_mi_id( uci_wrapper.get_type( i_index ) ) + log.debug( "mi_id: %s, uci_wrapper.get_key_pair_name( i_index ): %s" % ( mi_id, uci_wrapper.get_key_pair_name( i_index ) ) ) + uci_wrapper.set_mi( i_index, mi_id ) - log.debug( "***** Starting UCI instance '%s'" % uci_wrapper.get_name() ) - log.debug( 'Using following command: conn.run_instances( image_id=%s, key_name=%s )' % ( mi_id, uci_wrapper.get_key_pair_name( i_index ) ) ) - reservation = conn.run_instances( image_id=mi_id, key_name=uci_wrapper.get_key_pair_name( i_index ) ) - #reservation = conn.run_instances( image_id=instance.image, key_name=instance.keypair_name, security_groups=['galaxy'], instance_type=instance.type, placement=instance.availability_zone ) - l_time = datetime.utcnow() - uci_wrapper.set_launch_time( l_time, i_index=i_index ) # format_time( reservation.i_indexes[0].launch_time ) ) - if not uci_wrapper.uci_launch_time_set(): - uci_wrapper.set_uci_launch_time( l_time ) - uci_wrapper.set_reservation_id( i_index, str( reservation ).split(":")[1] ) - # TODO: if more than a single instance will be started through single reservation, change this reference to element [0] - i_id = str( reservation.instances[0]).split(":")[1] - uci_wrapper.set_instance_id( i_index, i_id ) - s = reservation.instances[0].state - uci_wrapper.change_state( s, i_id, s ) - log.debug( "Instance of UCI '%s' started, current state: '%s'" % ( uci_wrapper.get_name(), uci_wrapper.get_state() ) ) + if uci_wrapper.get_state() != uci_states.ERROR: + log.debug( "***** Starting UCI instance '%s'" % uci_wrapper.get_name() ) + log.debug( 'Using following command: conn.run_instances( image_id=%s, key_name=%s )' % ( mi_id, uci_wrapper.get_key_pair_name( i_index ) ) ) + try: + reservation = conn.run_instances( image_id=mi_id, key_name=uci_wrapper.get_key_pair_name( i_index ) ) + #reservation = conn.run_instances( image_id=instance.image, key_name=instance.keypair_name, security_groups=['galaxy'], instance_type=instance.type, placement=instance.availability_zone ) + except boto.exception.EC2ResponseError, e: + log.error( "EC2 response error when starting UCI '%s': '%s'" % ( uci_wrapper.get_name(), str(e) ) ) + uci_wrapper.set_error( "EC2 response error when starting: " + str(e), True ) + + l_time = datetime.utcnow() + uci_wrapper.set_launch_time( l_time, i_index=i_index ) # format_time( reservation.i_indexes[0].launch_time ) ) + if not uci_wrapper.uci_launch_time_set(): + uci_wrapper.set_uci_launch_time( l_time ) + try: + uci_wrapper.set_reservation_id( i_index, str( reservation ).split(":")[1] ) + # TODO: if more than a single instance will be started through single reservation, change this reference to element [0] + i_id = str( reservation.instances[0]).split(":")[1] + uci_wrapper.set_instance_id( i_index, i_id ) + s = reservation.instances[0].state + uci_wrapper.change_state( s, i_id, s ) + log.debug( "Instance of UCI '%s' started, current state: '%s'" % ( uci_wrapper.get_name(), uci_wrapper.get_state() ) ) + except boto.exception.EC2ResponseError, e: + log.error( "EC2 response error when retrieving instance information for UCI '%s': '%s'" % ( uci_wrapper.get_name(), str(e) ) ) + uci_wrapper.set_error( "EC2 response error when retrieving instance information: " + str(e), True ) + def stopUCI( self, uci_wrapper): """ @@ -316,11 +344,12 @@ def update( self ): """ - Runs a global status update on all instances that are in 'running', 'pending', "creating", or 'shutting-down' state. - Also, runs update on all storage volumes that are in "in-use", "creating", or 'None' state. + Runs a global status update on all instances that are in 'running', 'pending', or 'shutting-down' state. + Also, runs update on all storage volumes that are in 'in-use', 'creating', or 'None' state. Reason behind this method is to sync state of local DB and real-world resources """ log.debug( "Running general status update for EPC UCIs..." ) + # Update instances instances = model.CloudInstance.filter( or_( model.CloudInstance.c.state==instance_states.RUNNING, model.CloudInstance.c.state==instance_states.PENDING, model.CloudInstance.c.state==instance_states.SHUTTING_DOWN ) ).all() @@ -328,7 +357,8 @@ if self.type == inst.uci.credentials.provider.type: log.debug( "[%s] Running general status update on instance '%s'" % ( inst.uci.credentials.provider.type, inst.instance_id ) ) self.updateInstance( inst ) - + + # Update storage volume(s) stores = model.CloudStore.filter( or_( model.CloudStore.c.status==store_states.IN_USE, model.CloudStore.c.status==store_states.CREATING, model.CloudStore.c.status==None ) ).all() @@ -337,6 +367,20 @@ log.debug( "[%s] Running general status update on store '%s'" % ( store.uci.credentials.provider.type, store.volume_id ) ) self.updateStore( store ) + # Attempt at updating any zombie UCIs (i.e., instances that have been in SUBMITTED state for longer than expected - see below for exact time) + zombies = model.UCI.filter_by( state=uci_states.SUBMITTED ).all() + for zombie in zombies: + z_instances = model.CloudInstance.filter_by( uci_id=zombie.id) \ + .filter( or_( model.CloudInstance.c.state!=instance_states.TERMINATED, + model.CloudInstance.c.state == None ) ) \ + .all() + for z_inst in z_instances: + if self.type == z_inst.uci.credentials.provider.type: + log.debug( "z_inst.id: '%s', state: '%s'" % ( z_inst.id, z_inst.state ) ) +# td = datetime.utcnow() - zombie.update_time +# if td.seconds > 180: # if instance has been in SUBMITTED state for more than 3 minutes +# log.debug( "[%s] Running zombie repair update on instance '%s'" % ( inst.uci.credentials.provider.type, inst.id ) ) + def updateInstance( self, inst ): # Get credentials associated wit this instance @@ -346,15 +390,29 @@ a_key = uci.credentials.access_key s_key = uci.credentials.secret_key # Get connection - euca_region = RegionInfo( None, uci.credentials.provider.region_name, uci.credentials.provider.region_endpoint ) - conn = EC2Connection( aws_access_key_id=a_key, - aws_secret_access_key=s_key, - is_secure=uci.credentials.provider.is_secure, - port=uci.credentials.provider.port, - region=euca_region, - path=uci.credentials.provider.path ) + try: + euca_region = RegionInfo( None, uci.credentials.provider.region_name, uci.credentials.provider.region_endpoint ) + conn = EC2Connection( aws_access_key_id=a_key, + aws_secret_access_key=s_key, + is_secure=uci.credentials.provider.is_secure, + port=uci.credentials.provider.port, + region=euca_region, + path=uci.credentials.provider.path ) + except boto.exception.EC2ResponseError, e: + log.error( "Establishing connection with cloud failed: %s" % str(e) ) + uci.error( "Establishing connection with cloud failed: " + str(e) ) + uci.state( uci_states.ERROR ) + return None + # Get reservations handle for given instance - rl= conn.get_all_instances( [inst.instance_id] ) + try: + rl= conn.get_all_instances( [inst.instance_id] ) + except boto.exception.EC2ResponseError, e: + log.error( "Retrieving instance(s) from cloud for UCI '%s' failed: " % ( uci.name, str(e) ) ) + uci.error( "Retrieving instance(s) from cloud failed: " + str(e) ) + uci.state( uci_states.ERROR ) + return None + # Because EPC deletes references to reservations after a short while after instances have terminated, getting an empty list as a response to a query # typically means the instance has successfully shut down but the check was not performed in short enough amount of time. Until alternative solution # is found, below code sets state of given UCI to 'error' to indicate to the user something out of ordinary happened. @@ -371,26 +429,32 @@ # Update instance status in local DB with info from cloud provider for r in rl: for i, cInst in enumerate( r.instances ): - s = cInst.update() - log.debug( "Checking state of cloud instance '%s' associated with reservation '%s'. State='%s'" % ( cInst, r, s ) ) - if s != inst.state: - inst.state = s + try: + s = cInst.update() + log.debug( "Checking state of cloud instance '%s' associated with reservation '%s'. State='%s'" % ( cInst, r, s ) ) + if s != inst.state: + inst.state = s + inst.flush() + # After instance has shut down, ensure UCI is marked as 'available' + if s == instance_states.TERMINATED and uci.state != uci_states.ERROR: + uci.state = uci_states.AVAILABLE + uci.launch_time = None + uci.flush() + # Making sure state of UCI is updated. Once multiple instances become associated with single UCI, this will need to be changed. + if s != uci.state and s != instance_states.TERMINATED: + uci.state = s + uci.flush() + if cInst.public_dns_name != inst.public_dns: + inst.public_dns = cInst.public_dns_name + inst.flush() + if cInst.private_dns_name != inst.private_dns: + inst.private_dns = cInst.private_dns_name inst.flush() - # After instance has shut down, ensure UCI is marked as 'available' - if s == instance_states.TERMINATED and uci.state != uci_states.ERROR: - uci.state = uci_states.AVAILABLE - uci.launch_time = None - uci.flush() - # Making sure state of UCI is updated. Once multiple instances become associated with single UCI, this will need to be changed. - if s != uci.state and s != instance_states.TERMINATED: - uci.state = s - uci.flush() - if cInst.public_dns_name != inst.public_dns: - inst.public_dns = cInst.public_dns_name - inst.flush() - if cInst.private_dns_name != inst.private_dns: - inst.private_dns = cInst.private_dns_name - inst.flush() + except boto.exception.EC2ResponseError, e: + log.error( "Updating status of instance(s) from cloud for UCI '%s' failed: " % ( uci.name, str(e) ) ) + uci.error( "Updating volume status from cloud failed: " + str(e) ) + uci.state( uci_states.ERROR ) + return None def updateStore( self, store ): # Get credentials associated wit this store @@ -400,34 +464,53 @@ a_key = uci.credentials.access_key s_key = uci.credentials.secret_key # Get connection - euca_region = RegionInfo( None, uci.credentials.provider.region_name, uci.credentials.provider.region_endpoint ) - conn = EC2Connection( aws_access_key_id=a_key, - aws_secret_access_key=s_key, - is_secure=uci.credentials.provider.is_secure, - port=uci.credentials.provider.port, - region=euca_region, - path=uci.credentials.provider.path )# Get reservations handle for given store - vl = conn.get_all_volumes( [store.volume_id] ) + try: + euca_region = RegionInfo( None, uci.credentials.provider.region_name, uci.credentials.provider.region_endpoint ) + conn = EC2Connection( aws_access_key_id=a_key, + aws_secret_access_key=s_key, + is_secure=uci.credentials.provider.is_secure, + port=uci.credentials.provider.port, + region=euca_region, + path=uci.credentials.provider.path ) + except boto.exception.EC2ResponseError, e: + log.error( "Establishing connection with cloud failed: %s" % str(e) ) + uci.error( "Establishing connection with cloud failed: " + str(e) ) + uci.state( uci_states.ERROR ) + return None + + try: + vl = conn.get_all_volumes( [store.volume_id] ) + except boto.exception.EC2ResponseError, e: + log.error( "Retrieving volume(s) from cloud for UCI '%s' failed: " % ( uci.name, str(e) ) ) + uci.error( "Retrieving volume(s) from cloud failed: " + str(e) ) + uci.state( uci_states.ERROR ) + return None + # Update store status in local DB with info from cloud provider - if store.status != vl[0].status: - # In case something failed during creation of UCI but actual storage volume was created and yet - # UCI state remained as 'new', try to remedy this by updating UCI state here - if ( store.status == None ) and ( store.volume_id != None ): - uci.state = vl[0].status - uci.flush() - - store.status = vl[0].status - store.flush() - if store.i_id != vl[0].instance_id: - store.i_id = vl[0].instance_id - store.flush() - if store.attach_time != vl[0].attach_time: - store.attach_time = vl[0].attach_time - store.flush() - if store.device != vl[0].device: - store.device = vl[0].device - store.flush() - + try: + if store.status != vl[0].status: + # In case something failed during creation of UCI but actual storage volume was created and yet + # UCI state remained as 'new', try to remedy this by updating UCI state here + if ( store.status == None ) and ( store.volume_id != None ): + uci.state = vl[0].status + uci.flush() + + store.status = vl[0].status + store.flush() + if store.i_id != vl[0].instance_id: + store.i_id = vl[0].instance_id + store.flush() + if store.attach_time != vl[0].attach_time: + store.attach_time = vl[0].attach_time + store.flush() + if store.device != vl[0].device: + store.device = vl[0].device + store.flush() + except boto.exception.EC2ResponseError, e: + log.error( "Updating status of volume(s) from cloud for UCI '%s' failed: " % ( uci.name, str(e) ) ) + uci.error( "Updating volume status from cloud failed: " + str(e) ) + uci.state( uci_states.ERROR ) + return None # def updateUCI( self, uci ): # """ # Runs a global status update on all storage volumes and all instances that are diff -r be631ed97541 -r e19eef93584f lib/galaxy/web/controllers/cloud.py --- a/lib/galaxy/web/controllers/cloud.py Thu Oct 29 11:46:43 2009 -0400 +++ b/lib/galaxy/web/controllers/cloud.py Thu Oct 29 17:40:31 2009 -0400 @@ -104,9 +104,9 @@ model.UCI.c.state==uci_states.SUBMITTED_UCI ) ) \ .all() if pendingInstances: - trans.set_message( "Galaxy instance started. NOTE: Please wait about 3-5 minutes for the instance to " - "start up and then refresh this page. A button to connect to the instance will then appear alongside " - "instance description." ) + trans.set_message( "Galaxy instance started. Note that it will take several minutes for the instance to start " + "(typically, 3-5 minutes). Once the instance is running and Galaxy is available, " + "a button to connect to the instance will then appear alongside instance description." ) # log.debug( "provider.is_secure: '%s'" % trans.sa_session.query( model.CloudProvider).filter_by(id=1).first().is_secure ) # trans.sa_session.query( model.CloudProvider).filter_by(id=1).first().is_secure=False @@ -151,17 +151,7 @@ stores = get_stores( trans, uci ) # Ensure instance is not already running (or related state) and store relevant data # into DB to initiate instance startup by cloud manager - if ( len(stores) is not 0 ) and \ - ( uci.state != uci_states.SUBMITTED ) and \ - ( uci.state != uci_states.SUBMITTED_UCI ) and \ - ( uci.state != uci_states.PENDING ) and \ - ( uci.state != uci_states.DELETING ) and \ - ( uci.state != uci_states.DELETING_UCI ) and \ - ( uci.state != uci_states.DELETED ) and \ - ( uci.state != uci_states.RUNNING ) and \ - ( uci.state != uci_states.NEW_UCI ) and \ - ( uci.state != uci_states.NEW ) and \ - ( uci.state != uci_states.ERROR ): + if ( len(stores) is not 0 ) and ( uci.state == uci_states.AVAILABLE ): instance = model.CloudInstance() instance.user = user instance.image = mi @@ -175,7 +165,7 @@ session.save_or_update( uci ) session.flush() # Log - trans.log_event ("User initiated starting of cloud instance '%s'." % uci.name ) + trans.log_event ("User initiated starting of UCI '%s'." % uci.name ) trans.set_message( "Galaxy instance started. NOTE: Please wait about 3-5 minutes for the instance to " "start up and then refresh this page. A button to connect to the instance will then appear alongside " "instance description." ) @@ -268,14 +258,20 @@ storedCreds = trans.sa_session.query( model.CloudUserCredentials ).filter_by( user=user ).all() if len( storedCreds ) == 0: return trans.show_error_message( "You must register credentials before configuring a Galaxy instance." ) + # TODO: This should be filled automatically but ties to implementation for diff provider is a problem... # Create dict mapping of cloud providers to zones available by those providers providersToZones = {} for storedCred in storedCreds: - if storedCred.provider.type == 'ec2': + if storedCred.provider.region_name == 'us-east-1': ec2_zones = ['us-east-1a', 'us-east-1b', 'us-east-1c', 'us-east-1d'] providersToZones[storedCred.name] = ec2_zones + elif storedCred.provider.region_name == 'eu-west-1': + ec2_zones = ['eu-west-1a', 'eu-west-1b'] + providersToZones[storedCred.name] = ec2_zones elif storedCred.provider.type == 'eucalyptus': providersToZones[storedCred.name] = ['epc'] + else: + providersToZones[storedCred.name] = ['Unknown provider zone'] if instanceName: # Create new user configured instance diff -r be631ed97541 -r e19eef93584f templates/cloud/add_provider.mako --- a/templates/cloud/add_provider.mako Thu Oct 29 11:46:43 2009 -0400 +++ b/templates/cloud/add_provider.mako Thu Oct 29 17:40:31 2009 -0400 @@ -117,9 +117,6 @@ <div id="region_selection" class="form-row-input"> <input type="text" name="region_name" id="region_name" value="${region_name}" size="40"> </div> - %if error.has_key('name_error'): - <div class="form-row-error-message">${error['name_error']}</div> - %endif <div style="clear: both"></div> </div> diff -r be631ed97541 -r e19eef93584f templates/cloud/configure_cloud.mako --- a/templates/cloud/configure_cloud.mako Thu Oct 29 11:46:43 2009 -0400 +++ b/templates/cloud/configure_cloud.mako Thu Oct 29 17:40:31 2009 -0400 @@ -33,6 +33,11 @@ if ( old_state=='pending' && new_state=='running' ) { location.reload(true); } + else if ( ( old_state=='running' && new_state=='error' ) || ( old_state=='pending' && new_state=='error' ) || \ + ( old_state=='submitted' && new_state=='error' ) || ( old_state=='submittedUCI' && new_state=='error' ) || \ + ( old_state=='shutting-down' && new_state=='error' ) ) { + location.reload(true); + } else if ( old_state=='shutting-down' && new_state=='available' ) { location.reload(true); } @@ -244,14 +249,14 @@ %if state =='error': <div id="short"> <a onclick="document.getElementById('full').style.display = 'block'; - document.getElementById('short').style.display = 'none'; return 0" + document.getElementById('short').style.display = 'none'; return 0" href="javascript:void(0)"> error </a> </div> <div id="full" style="DISPLAY: none"> <a onclick="document.getElementById('short').style.display = 'block'; - document.getElementById('full').style.display = 'none'; return 0;" + document.getElementById('full').style.display = 'none'; return 0;" href="javascript:void(0)"> error:</a><br /> ${str(prevInstance.error)} diff -r be631ed97541 -r e19eef93584f templates/cloud/configure_uci.mako --- a/templates/cloud/configure_uci.mako Thu Oct 29 11:46:43 2009 -0400 +++ b/templates/cloud/configure_uci.mako Thu Oct 29 17:40:31 2009 -0400 @@ -81,7 +81,7 @@ cls += " form-row-error" %> <div class="${cls}"> - <label>Permanent storage size (1-1000GB)<br/>NOTE: you will be able to add more storage later:</label> + <label>Permanent storage size (1-1000GB):<br/>(Note: you will be able to add more storage later)</label> <div class="form-row-input"> <input type="text" name="volSize" value="${volSize}" size="40"> </div> @@ -97,7 +97,7 @@ cls += " form-row-error" %> <div class="${cls}"> - <label>Zone to create storage in</label> + <label>Zone to create storage in:</label> <div class="form-row-input"> <select id="zones" name="zone" style="width:40em"> </select> diff -r be631ed97541 -r e19eef93584f templates/cloud/viewInstance.mako --- a/templates/cloud/viewInstance.mako Thu Oct 29 11:46:43 2009 -0400 +++ b/templates/cloud/viewInstance.mako Thu Oct 29 17:40:31 2009 -0400 @@ -83,12 +83,15 @@ </tr> <tr> <td> Public DNS:</td> - <td> ${liveInstance.public_dns} </td> + <% + lnk="http://"+str(liveInstance.public_dns) + %> + <td> <a href="${lnk}" target="_blank">${liveInstance.public_dns}</a></td> </tr> %if liveInstance.private_dns != None: <tr> <td> Private DNS:</td> - <td> ${liveInstance.private_dns} </td> + <td> ${liveInstance.private_dns}</td> </tr> %endif %if liveInstance.availability_zone != None: