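Fix shutdown bug: look up DryadAppMaster.shutdown with the correct JNI signature and report job success or failure to YARN.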
This commit is contained in:
parent
78c9aabd01
commit
111604c5c3
|
|
@ -459,7 +459,7 @@ namespace Microsoft.Research.Dryad
|
||||||
|
|
||||||
void ISchedulerHelper.FinishJob()
|
void ISchedulerHelper.FinishJob()
|
||||||
{
|
{
|
||||||
m_appMaster.Finish();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
string ISchedulerHelper.GetVertexServiceBaseAddress(string nodename, int instanceId)
|
string ISchedulerHelper.GetVertexServiceBaseAddress(string nodename, int instanceId)
|
||||||
|
|
@ -535,6 +535,8 @@ namespace Microsoft.Research.Dryad
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
m_appMaster.Finish(true);
|
||||||
|
|
||||||
if (wait)
|
if (wait)
|
||||||
{
|
{
|
||||||
try
|
try
|
||||||
|
|
@ -619,6 +621,7 @@ namespace Microsoft.Research.Dryad
|
||||||
DryadLogger.LogInformation("QueueYarnUpdate", "Task {0} on node {2} is in state {3}", taskId, nodeName,
|
DryadLogger.LogInformation("QueueYarnUpdate", "Task {0} on node {2} is in state {3}", taskId, nodeName,
|
||||||
taskState);
|
taskState);
|
||||||
// Set change event arguments
|
// Set change event arguments
|
||||||
|
|
||||||
YarnTaskState yTaskState = (YarnTaskState)taskState;
|
YarnTaskState yTaskState = (YarnTaskState)taskState;
|
||||||
VertexTask v = new VertexTask(taskId, nodeName, yTaskState, int.MaxValue, DateTime.UtcNow);
|
VertexTask v = new VertexTask(taskId, nodeName, yTaskState, int.MaxValue, DateTime.UtcNow);
|
||||||
m_taskUpdateQueue.Add(v);
|
m_taskUpdateQueue.Add(v);
|
||||||
|
|
|
||||||
|
|
@ -62,12 +62,12 @@ namespace Microsoft { namespace Research { namespace Dryad { namespace YarnBridg
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void AMInstance::Finish()
|
void AMInstance::Finish(bool success)
|
||||||
{
|
{
|
||||||
if (m_instance != IntPtr::Zero)
|
if (m_instance != IntPtr::Zero)
|
||||||
{
|
{
|
||||||
AMNativeInstance *instance = (AMNativeInstance *) m_instance.ToPointer();
|
AMNativeInstance *instance = (AMNativeInstance *) m_instance.ToPointer();
|
||||||
instance->Shutdown();
|
instance->Shutdown(success);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -37,7 +37,7 @@ namespace Microsoft { namespace Research { namespace Dryad { namespace YarnBridg
|
||||||
|
|
||||||
|
|
||||||
void Close();
|
void Close();
|
||||||
void Finish();
|
void Finish(bool success);
|
||||||
|
|
||||||
int GetHealthyNodeCount();
|
int GetHealthyNodeCount();
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -246,8 +246,8 @@ namespace DryadYarn
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
jmethodID midShutdown = m_env->e->GetMethodID(clsDryadAppMaster, "shutdown", "()V");
|
jmethodID midShutdown = m_env->e->GetMethodID(clsDryadAppMaster, "shutdown", "(ZZ)V");
|
||||||
if (midSchProc == NULL)
|
if (midShutdown == NULL)
|
||||||
{
|
{
|
||||||
jthrowable exc;
|
jthrowable exc;
|
||||||
exc = m_env->e->ExceptionOccurred();
|
exc = m_env->e->ExceptionOccurred();
|
||||||
|
|
@ -288,7 +288,6 @@ namespace DryadYarn
|
||||||
|
|
||||||
jstring jName = env->NewStringUTF(name);
|
jstring jName = env->NewStringUTF(name);
|
||||||
jstring jCmdLine = env->NewStringUTF(commandLine);
|
jstring jCmdLine = env->NewStringUTF(commandLine);
|
||||||
|
|
||||||
env->CallVoidMethod(m_inst->m_obj, m_inst->m_midSchProc, vertexId, jName, jCmdLine);
|
env->CallVoidMethod(m_inst->m_obj, m_inst->m_midSchProc, vertexId, jName, jCmdLine);
|
||||||
|
|
||||||
env->DeleteLocalRef(jName);
|
env->DeleteLocalRef(jName);
|
||||||
|
|
@ -299,16 +298,25 @@ namespace DryadYarn
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool AMNativeInstance::Shutdown()
|
bool AMNativeInstance::Shutdown(bool success)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Shutting down AMNativeInstance\n");
|
fprintf(stderr, "Shutting down AMNativeInstance\n");
|
||||||
fflush(stderr);
|
fflush(stderr);
|
||||||
JNIEnv* env = AttachToJvm();
|
JNIEnv* env = AttachToJvm();
|
||||||
|
fprintf(stderr, "Calling Shutdown\n");
|
||||||
env->CallVoidMethod(m_inst->m_obj, m_inst->m_midShutdown);
|
fflush(stderr);
|
||||||
|
|
||||||
|
jboolean jImmedShutdown = 0;
|
||||||
|
jboolean jSuccess = 0;
|
||||||
|
if (success)
|
||||||
|
{
|
||||||
|
jSuccess = 1;
|
||||||
|
}
|
||||||
|
env->CallVoidMethod(m_inst->m_obj, m_inst->m_midShutdown, jImmedShutdown, jSuccess);
|
||||||
|
|
||||||
// detach here?
|
// detach here?
|
||||||
|
fprintf(stderr, "Finished Shutdown\n");
|
||||||
|
fflush(stderr);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -70,7 +70,7 @@ namespace DryadYarn
|
||||||
char* GetExceptionMessage();
|
char* GetExceptionMessage();
|
||||||
|
|
||||||
bool ScheduleProcess(int vertexId, const char* name, const char* commandLine);
|
bool ScheduleProcess(int vertexId, const char* name, const char* commandLine);
|
||||||
bool Shutdown();
|
bool Shutdown(bool success);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
//void* operator new( size_t );
|
//void* operator new( size_t );
|
||||||
|
|
|
||||||
|
|
@ -339,11 +339,11 @@ DrString DrHdfsOutputStream::GetURIForWrite(int partitionIndex,
|
||||||
DrMetaDataRef /*metaData */)
|
DrMetaDataRef /*metaData */)
|
||||||
{
|
{
|
||||||
DrAssert(m_hdfsInstance != DrNull);
|
DrAssert(m_hdfsInstance != DrNull);
|
||||||
DrString fileName;
|
String^ fileName = m_baseUri + "-tmp/part-" + partitionIndex.ToString("D8") + "." + version;
|
||||||
fileName.Set(m_baseUri);
|
|
||||||
//String^ fileName = m_baseUri + "-tmp/part-" + partitionIndex.ToString("D8") + "." + version;
|
//DrLogI("HDFS GetURIForWrite returning '%s'", fileName); // DCF HDFS debug
|
||||||
fileName.AppendF("-tmp/part-%8d.%d", partitionIndex, version);
|
|
||||||
return fileName;
|
return DrString(fileName);
|
||||||
}
|
}
|
||||||
|
|
||||||
void DrHdfsOutputStream::DiscardUnusedPartition(int partitionIndex,
|
void DrHdfsOutputStream::DiscardUnusedPartition(int partitionIndex,
|
||||||
|
|
|
||||||
|
|
@ -134,7 +134,7 @@ public class DryadAppMaster
|
||||||
{
|
{
|
||||||
log = LogFactory.getLog("DryadAppMaster");
|
log = LogFactory.getLog("DryadAppMaster");
|
||||||
log.info("In DryadAppMaster constructor");
|
log.info("In DryadAppMaster constructor");
|
||||||
shuttingDown = new AtomicBoolean();
|
shuttingDown = new AtomicBoolean(false);
|
||||||
scheduleProcesses = new AtomicBoolean(true);
|
scheduleProcesses = new AtomicBoolean(true);
|
||||||
responseId = new AtomicInteger();
|
responseId = new AtomicInteger();
|
||||||
nextVertexId = new AtomicInteger(2); //first vertex id is 2 to map to Dryad Vertex Scheduler
|
nextVertexId = new AtomicInteger(2); //first vertex id is 2 to map to Dryad Vertex Scheduler
|
||||||
|
|
@ -233,19 +233,20 @@ public class DryadAppMaster
|
||||||
{
|
{
|
||||||
// check to see if we should cancel the heartbeat
|
// check to see if we should cancel the heartbeat
|
||||||
if (shuttingDown.get()) {
|
if (shuttingDown.get()) {
|
||||||
|
log.info("Cancelling heartbeat");
|
||||||
heartbeatHandle.cancel(true);
|
heartbeatHandle.cancel(true);
|
||||||
}
|
} else {
|
||||||
log.info("Sending heartbeat to the RM");
|
log.info("Sending heartbeat to the RM");
|
||||||
AllocateResponse response = sendAllocateRequest();
|
AllocateResponse response = sendAllocateRequest();
|
||||||
if (response != null) {
|
if (response != null) {
|
||||||
int oldNodeCount = clusterNodeCount;
|
int oldNodeCount = clusterNodeCount;
|
||||||
clusterNodeCount = response.getNumClusterNodes();
|
clusterNodeCount = response.getNumClusterNodes();
|
||||||
if (clusterNodeCount != oldNodeCount) {
|
if (clusterNodeCount != oldNodeCount) {
|
||||||
log.info("There are now " + clusterNodeCount + " available nodes on the cluster.");
|
log.info("There are now " + clusterNodeCount + " available nodes on the cluster.");
|
||||||
|
}
|
||||||
|
processResponse(response);
|
||||||
}
|
}
|
||||||
processResponse(response);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void launchContainer(Container container, ContainerManager cm)
|
private void launchContainer(Container container, ContainerManager cm)
|
||||||
|
|
@ -317,7 +318,9 @@ public class DryadAppMaster
|
||||||
{
|
{
|
||||||
// is this the first allocation?
|
// is this the first allocation?
|
||||||
if (scheduleProcesses.compareAndSet(true, false)) {
|
if (scheduleProcesses.compareAndSet(true, false)) {
|
||||||
int numProcessesToStart = Math.max(response.getNumClusterNodes() - 1, maxNodes); //don't schedule a process where the graph manager is running
|
//don't schedule a process where the graph manager is running
|
||||||
|
int numProcessesToStart = Math.max(response.getNumClusterNodes() - 1, maxNodes);
|
||||||
|
log.info("There are " + response.getNumClusterNodes() + " nodes in the cluster. maxNodes = " + maxNodes);
|
||||||
scheduleProcess(numProcessesToStart);
|
scheduleProcess(numProcessesToStart);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -339,7 +342,8 @@ public class DryadAppMaster
|
||||||
|
|
||||||
// Need to notify graph manager of current state
|
// Need to notify graph manager of current state
|
||||||
VertexInfo vi = runningContainers.remove(cid);
|
VertexInfo vi = runningContainers.remove(cid);
|
||||||
if (vi != null) {
|
//only send events up the stack when we are not shutting down
|
||||||
|
if (vi != null && !shuttingDown.get()) {
|
||||||
int containerState = 0;
|
int containerState = 0;
|
||||||
if (containerStatus.getState() == ContainerState.COMPLETE) {
|
if (containerStatus.getState() == ContainerState.COMPLETE) {
|
||||||
if (containerStatus.getExitStatus() == 0) {
|
if (containerStatus.getExitStatus() == 0) {
|
||||||
|
|
@ -426,8 +430,8 @@ public class DryadAppMaster
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void shutdown(boolean immediateShutdown)
|
public void shutdown(boolean immediateShutdown, boolean success)
|
||||||
{
|
{
|
||||||
shuttingDown.set(true);
|
shuttingDown.set(true);
|
||||||
heartbeatHandle.cancel(immediateShutdown); // if we are shutting down, we can just interrupt the running thread, if necessary
|
heartbeatHandle.cancel(immediateShutdown); // if we are shutting down, we can just interrupt the running thread, if necessary
|
||||||
log.info("Shutdown heartbeats to RM");
|
log.info("Shutdown heartbeats to RM");
|
||||||
|
|
@ -435,13 +439,18 @@ public class DryadAppMaster
|
||||||
// send the shutdown message to the RM
|
// send the shutdown message to the RM
|
||||||
FinishApplicationMasterRequest request = Records.newRecord(FinishApplicationMasterRequest.class);
|
FinishApplicationMasterRequest request = Records.newRecord(FinishApplicationMasterRequest.class);
|
||||||
request.setAppAttemptId(appAttemptID);
|
request.setAppAttemptId(appAttemptID);
|
||||||
request.setFinishApplicationStatus(FinalApplicationStatus.SUCCEEDED); // NYI - determine success
|
if (success) {
|
||||||
|
request.setFinishApplicationStatus(FinalApplicationStatus.SUCCEEDED);
|
||||||
|
} else {
|
||||||
|
request.setFinishApplicationStatus(FinalApplicationStatus.FAILED);
|
||||||
|
}
|
||||||
try {
|
try {
|
||||||
//response is currently an empty class
|
//response is currently an empty class
|
||||||
FinishApplicationMasterResponse response = resourceManager. finishApplicationMaster(request);
|
FinishApplicationMasterResponse response = resourceManager. finishApplicationMaster(request);
|
||||||
} catch (YarnRemoteException|IOException e) {
|
} catch (YarnRemoteException|IOException e) {
|
||||||
log.error("Error communicating with RM: " + e.getMessage() , e);
|
log.error("Error communicating with RM: " + e.getMessage() , e);
|
||||||
}
|
}
|
||||||
|
log.info("FinishApplicationMasterRequest sent");
|
||||||
}
|
}
|
||||||
|
|
||||||
private void startContainers(List<Container> newContainers)
|
private void startContainers(List<Container> newContainers)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue