Fix shutdown bug.

This commit is contained in:
Dennis Fetterly 2013-07-26 10:37:15 -07:00
parent 78c9aabd01
commit 111604c5c3
7 changed files with 53 additions and 33 deletions

View File

@ -459,7 +459,7 @@ namespace Microsoft.Research.Dryad
void ISchedulerHelper.FinishJob() void ISchedulerHelper.FinishJob()
{ {
m_appMaster.Finish();
} }
string ISchedulerHelper.GetVertexServiceBaseAddress(string nodename, int instanceId) string ISchedulerHelper.GetVertexServiceBaseAddress(string nodename, int instanceId)
@ -535,6 +535,8 @@ namespace Microsoft.Research.Dryad
} }
} }
m_appMaster.Finish(true);
if (wait) if (wait)
{ {
try try
@ -619,6 +621,7 @@ namespace Microsoft.Research.Dryad
DryadLogger.LogInformation("QueueYarnUpdate", "Task {0} on node {2} is in state {3}", taskId, nodeName, DryadLogger.LogInformation("QueueYarnUpdate", "Task {0} on node {2} is in state {3}", taskId, nodeName,
taskState); taskState);
// Set change event arguments // Set change event arguments
YarnTaskState yTaskState = (YarnTaskState)taskState; YarnTaskState yTaskState = (YarnTaskState)taskState;
VertexTask v = new VertexTask(taskId, nodeName, yTaskState, int.MaxValue, DateTime.UtcNow); VertexTask v = new VertexTask(taskId, nodeName, yTaskState, int.MaxValue, DateTime.UtcNow);
m_taskUpdateQueue.Add(v); m_taskUpdateQueue.Add(v);

View File

@ -62,12 +62,12 @@ namespace Microsoft { namespace Research { namespace Dryad { namespace YarnBridg
} }
} }
void AMInstance::Finish() void AMInstance::Finish(bool success)
{ {
if (m_instance != IntPtr::Zero) if (m_instance != IntPtr::Zero)
{ {
AMNativeInstance *instance = (AMNativeInstance *) m_instance.ToPointer(); AMNativeInstance *instance = (AMNativeInstance *) m_instance.ToPointer();
instance->Shutdown(); instance->Shutdown(success);
} }
} }

View File

@ -37,7 +37,7 @@ namespace Microsoft { namespace Research { namespace Dryad { namespace YarnBridg
void Close(); void Close();
void Finish(); void Finish(bool success);
int GetHealthyNodeCount(); int GetHealthyNodeCount();

View File

@ -246,8 +246,8 @@ namespace DryadYarn
return false; return false;
} }
jmethodID midShutdown = m_env->e->GetMethodID(clsDryadAppMaster, "shutdown", "()V"); jmethodID midShutdown = m_env->e->GetMethodID(clsDryadAppMaster, "shutdown", "(ZZ)V");
if (midSchProc == NULL) if (midShutdown == NULL)
{ {
jthrowable exc; jthrowable exc;
exc = m_env->e->ExceptionOccurred(); exc = m_env->e->ExceptionOccurred();
@ -288,7 +288,6 @@ namespace DryadYarn
jstring jName = env->NewStringUTF(name); jstring jName = env->NewStringUTF(name);
jstring jCmdLine = env->NewStringUTF(commandLine); jstring jCmdLine = env->NewStringUTF(commandLine);
env->CallVoidMethod(m_inst->m_obj, m_inst->m_midSchProc, vertexId, jName, jCmdLine); env->CallVoidMethod(m_inst->m_obj, m_inst->m_midSchProc, vertexId, jName, jCmdLine);
env->DeleteLocalRef(jName); env->DeleteLocalRef(jName);
@ -299,16 +298,25 @@ namespace DryadYarn
return true; return true;
} }
bool AMNativeInstance::Shutdown() bool AMNativeInstance::Shutdown(bool success)
{ {
fprintf(stderr, "Shutting down AMNativeInstance\n"); fprintf(stderr, "Shutting down AMNativeInstance\n");
fflush(stderr); fflush(stderr);
JNIEnv* env = AttachToJvm(); JNIEnv* env = AttachToJvm();
fprintf(stderr, "Calling Shutdown\n");
fflush(stderr);
env->CallVoidMethod(m_inst->m_obj, m_inst->m_midShutdown); jboolean jImmedShutdown = 0;
jboolean jSuccess = 0;
if (success)
{
jSuccess = 1;
}
env->CallVoidMethod(m_inst->m_obj, m_inst->m_midShutdown, jImmedShutdown, jSuccess);
// detach here? // detach here?
fprintf(stderr, "Finished Shutdown\n");
fflush(stderr);
return true; return true;
} }

View File

@ -70,7 +70,7 @@ namespace DryadYarn
char* GetExceptionMessage(); char* GetExceptionMessage();
bool ScheduleProcess(int vertexId, const char* name, const char* commandLine); bool ScheduleProcess(int vertexId, const char* name, const char* commandLine);
bool Shutdown(); bool Shutdown(bool success);
private: private:
//void* operator new( size_t ); //void* operator new( size_t );

View File

@ -339,11 +339,11 @@ DrString DrHdfsOutputStream::GetURIForWrite(int partitionIndex,
DrMetaDataRef /*metaData */) DrMetaDataRef /*metaData */)
{ {
DrAssert(m_hdfsInstance != DrNull); DrAssert(m_hdfsInstance != DrNull);
DrString fileName; String^ fileName = m_baseUri + "-tmp/part-" + partitionIndex.ToString("D8") + "." + version;
fileName.Set(m_baseUri);
//String^ fileName = m_baseUri + "-tmp/part-" + partitionIndex.ToString("D8") + "." + version; //DrLogI("HDFS GetURIForWrite returning '%s'", fileName); // DCF HDFS debug
fileName.AppendF("-tmp/part-%8d.%d", partitionIndex, version);
return fileName; return DrString(fileName);
} }
void DrHdfsOutputStream::DiscardUnusedPartition(int partitionIndex, void DrHdfsOutputStream::DiscardUnusedPartition(int partitionIndex,

View File

@ -134,7 +134,7 @@ public class DryadAppMaster
{ {
log = LogFactory.getLog("DryadAppMaster"); log = LogFactory.getLog("DryadAppMaster");
log.info("In DryadAppMaster constructor"); log.info("In DryadAppMaster constructor");
shuttingDown = new AtomicBoolean(); shuttingDown = new AtomicBoolean(false);
scheduleProcesses = new AtomicBoolean(true); scheduleProcesses = new AtomicBoolean(true);
responseId = new AtomicInteger(); responseId = new AtomicInteger();
nextVertexId = new AtomicInteger(2); //first vertex id is 2 to map to Dryad Vertex Scheduler nextVertexId = new AtomicInteger(2); //first vertex id is 2 to map to Dryad Vertex Scheduler
@ -233,8 +233,9 @@ public class DryadAppMaster
{ {
// check to see if we should cancel the heartbeat // check to see if we should cancel the heartbeat
if (shuttingDown.get()) { if (shuttingDown.get()) {
log.info("Cancelling heartbeat");
heartbeatHandle.cancel(true); heartbeatHandle.cancel(true);
} } else {
log.info("Sending heartbeat to the RM"); log.info("Sending heartbeat to the RM");
AllocateResponse response = sendAllocateRequest(); AllocateResponse response = sendAllocateRequest();
if (response != null) { if (response != null) {
@ -245,7 +246,7 @@ public class DryadAppMaster
} }
processResponse(response); processResponse(response);
} }
}
} }
private void launchContainer(Container container, ContainerManager cm) private void launchContainer(Container container, ContainerManager cm)
@ -317,7 +318,9 @@ public class DryadAppMaster
{ {
// is this the first allocation? // is this the first allocation?
if (scheduleProcesses.compareAndSet(true, false)) { if (scheduleProcesses.compareAndSet(true, false)) {
int numProcessesToStart = Math.max(response.getNumClusterNodes() - 1, maxNodes); //don't schedule a process where the graph manager is running //don't schedule a process where the graph manager is running
int numProcessesToStart = Math.max(response.getNumClusterNodes() - 1, maxNodes);
log.info("There are " + response.getNumClusterNodes() + " nodes in the cluster. maxNodes = " + maxNodes);
scheduleProcess(numProcessesToStart); scheduleProcess(numProcessesToStart);
} }
@ -339,7 +342,8 @@ public class DryadAppMaster
// Need to notify graph manager of current state // Need to notify graph manager of current state
VertexInfo vi = runningContainers.remove(cid); VertexInfo vi = runningContainers.remove(cid);
if (vi != null) { //only send events up the stack when we are not shutting down
if (vi != null && !shuttingDown.get()) {
int containerState = 0; int containerState = 0;
if (containerStatus.getState() == ContainerState.COMPLETE) { if (containerStatus.getState() == ContainerState.COMPLETE) {
if (containerStatus.getExitStatus() == 0) { if (containerStatus.getExitStatus() == 0) {
@ -426,7 +430,7 @@ public class DryadAppMaster
} }
} }
public void shutdown(boolean immediateShutdown) public void shutdown(boolean immediateShutdown, boolean success)
{ {
shuttingDown.set(true); shuttingDown.set(true);
heartbeatHandle.cancel(immediateShutdown); // if we are shutting down, we can just interrupt the running thread, if necessary heartbeatHandle.cancel(immediateShutdown); // if we are shutting down, we can just interrupt the running thread, if necessary
@ -435,13 +439,18 @@ public class DryadAppMaster
// send the shutdown message to the RM // send the shutdown message to the RM
FinishApplicationMasterRequest request = Records.newRecord(FinishApplicationMasterRequest.class); FinishApplicationMasterRequest request = Records.newRecord(FinishApplicationMasterRequest.class);
request.setAppAttemptId(appAttemptID); request.setAppAttemptId(appAttemptID);
request.setFinishApplicationStatus(FinalApplicationStatus.SUCCEEDED); // NYI - determine success if (success) {
request.setFinishApplicationStatus(FinalApplicationStatus.SUCCEEDED);
} else {
request.setFinishApplicationStatus(FinalApplicationStatus.FAILED);
}
try { try {
//response is currently an empty class //response is currently an empty class
FinishApplicationMasterResponse response = resourceManager. finishApplicationMaster(request); FinishApplicationMasterResponse response = resourceManager. finishApplicationMaster(request);
} catch (YarnRemoteException|IOException e) { } catch (YarnRemoteException|IOException e) {
log.error("Error communicating with RM: " + e.getMessage() , e); log.error("Error communicating with RM: " + e.getMessage() , e);
} }
log.info("FinishApplicationMasterRequest sent");
} }
private void startContainers(List<Container> newContainers) private void startContainers(List<Container> newContainers)