/*
Copyright (c) Microsoft Corporation
All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
compliance with the License. You may obtain a copy of the License
at http://www.apache.org/licenses/LICENSE-2.0
THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER
EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED WARRANTIES OR CONDITIONS OF
TITLE, FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABLITY OR NON-INFRINGEMENT.
See the Apache Version 2.0 License for specific language governing permissions and
limitations under the License.
*/
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Research.Peloponnese.NotHttpClient;
namespace Microsoft.Research.Dryad.ClusterInterface
{
///
/// This is the container class for a process once it has been scheduled
///
internal class Process : IProcess
{
/// <summary>
/// Lifecycle states of a process as tracked by this class. The state records what
/// has already been reported to the higher level (the watcher), and is used to
/// debug bad state transitions.
/// </summary>
private enum State
{
// set by the constructor, before anything has been reported to the watcher
Initializing,
// entered by ToQueued, which expects the previous state to be Initializing
Queued,
// entered by ToMatched, which expects the previous state to be Queued
Matched,
// entered by ToCreated, which expects the previous state to be Matched
Created,
// entered by ToStarted, which expects the previous state to be Created
Started,
// entered by ToExited; this can be reached from any preceding state
Exited
}
/// <summary>
/// current lifecycle state: keeps track of what we have told the higher level,
/// and lets us debug bad state transitions. Read and written under lock(this).
/// </summary>
private State state;

/// <summary>
/// string used to start the remote process
/// </summary>
private readonly string commandLine;

/// <summary>
/// this is the handle that the scheduler supplies that is used to refer
/// to the process
/// </summary>
private readonly ISchedulerProcess schedulerProcess;

/// <summary>
/// completion source whose task finishes (with result true) when the process is
/// canceled via Cancel()
/// </summary>
private readonly TaskCompletionSource<bool> cancelTask;

/// <summary>
/// this is the object passed down by higher levels of the software stack,
/// that receives updates as the process is queued, matched, scheduled, run, etc.
/// </summary>
private readonly IProcessWatcher watcher;

/// <summary>
/// suffix appended to the computer's process server address to refer to this process;
/// null until SetComputer has been called
/// </summary>
private string directory;

/// <summary>
/// this is the computer we are running on, once we get scheduled
/// </summary>
private IComputer computer;

/// <summary>
/// this task is started when the owning computer is shutting down
/// </summary>
private Task computerCancellation;

/// <summary>
/// statusVersion is the version number associated with the process at the remote
/// web server, which is incremented every time the process' state changes, e.g.
/// when it starts running or exits
/// </summary>
private UInt64 statusVersion;

/// <summary>
/// statusString is the status associated with the process at the remote web
/// server, which can be Queued, Running, Canceling or Completed
/// </summary>
private string statusString;

/// <summary>
/// the interface to the application's logging
/// </summary>
private readonly ILogger logger;

/// <summary>
/// construct a new object to represent the lifecycle of a process being scheduled
/// </summary>
/// <param name="p">handle supplied by the scheduler to refer to the process</param>
/// <param name="w">watcher that is notified of every state transition</param>
/// <param name="cmd">string used to start the remote process</param>
/// <param name="l">logger that receives diagnostic messages</param>
public Process(ISchedulerProcess p, IProcessWatcher w, string cmd, ILogger l)
{
    schedulerProcess = p;
    watcher = w;
    commandLine = cmd;
    logger = l;

    state = State.Initializing;

    // the result type is bool because Cancel() completes the source with 'true'
    cancelTask = new TaskCompletionSource<bool>();

    // no computer has been assigned yet, and no remote status has been seen
    directory = null;
    statusVersion = 0;
    statusString = "";
}
/// <summary>
/// the handle supplied by the scheduler to refer to this process
/// </summary>
public ISchedulerProcess SchedulerProcess
{
    get { return schedulerProcess; }
}

/// <summary>
/// identifier of the process, taken from the scheduler handle; used in log messages
/// </summary>
public string Id
{
    get { return schedulerProcess.Id; }
}

/// <summary>
/// the string used to start the remote process
/// </summary>
public string CommandLine
{
    get { return commandLine; }
}

/// <summary>
/// the directory suffix that identifies the process at its computer; null until the
/// process has been assigned to a computer
/// </summary>
public string Directory
{
    get { return directory; }
}
/// <summary>
/// set the computer where the process is running, together with the task that
/// signals that computer's shutdown and the suffix used to refer to the process there
/// </summary>
/// <param name="remote">computer the process has been assigned to</param>
/// <param name="remoteCancel">task that is started when the owning computer is shutting down</param>
/// <param name="suffix">value stored as the process directory</param>
private void SetComputer(IComputer remote, Task remoteCancel, string suffix)
{
// use a lock here because computer can be accessed by other threads trying
// to get process keys
lock (this)
{
computer = remote;
computerCancellation = remoteCancel;
directory = suffix;
}
}
/// <summary>
/// POST a payload to the process server of the given computer
/// </summary>
/// <param name="computer">computer whose ProcessServer address prefixes the request uri</param>
/// <param name="requestString">path and query string appended to the process server address</param>
/// <param name="payload">bytes written as the body of the request</param>
/// <returns>
/// null if the request succeeded; otherwise a description of the failure. Failures while
/// sending the request or reading the response are logged and returned, not thrown.
/// </returns>
private async Task<string> PostRequest(IComputer computer, string requestString, byte[] payload)
{
    string uri = computer.ProcessServer + requestString;

    IHttpRequest request = HttpClient.Create(uri);
    request.Timeout = 30 * 1000; // this should come back quickly. If it really takes a long time, something is wrong
    request.Method = "POST";

    try
    {
        using (Stream upload = request.GetRequestStream())
        {
            await upload.WriteAsync(payload, 0, payload.Length);
        }

        using (IHttpResponse response = await request.GetResponseAsync())
        {
            // this succeeded but we don't care about the response: null indicates no error
            return null;
        }
    }
    catch (NotHttpException e)
    {
        // the http layer reported a failure: include the status it carries
        string error = "Post " + uri + " failed message " + e.Message + " status " + e.Response.StatusCode + ": " + e.Response.StatusDescription;
        logger.Log(error);
        return error;
    }
    catch (Exception e)
    {
        string error = "Post " + uri + " failed message " + e.Message;
        logger.Log(error);
        return error;
    }
}
/// <summary>
/// report the process as matched to a computer, then ask that computer's process
/// server to create it
/// </summary>
/// <param name="computer">computer the process should be created on</param>
/// <param name="interrupt">task whose completion abandons the creation attempt</param>
/// <returns>
/// true if the remote create succeeded (the process is now Created); false if the
/// attempt was interrupted or failed, in which case the process has been moved to Exited
/// </returns>
private async Task<bool> Schedule(IComputer computer, Task interrupt)
{
    logger.Log("Process " + Id + " scheduling itself as " + directory + " on computer " + computer.Name + " at " + computer.Host);

    ToMatched(computer, DateTime.Now.ToFileTimeUtc());

    // bail has the same result type as the upload so that the two can be raced with
    // WhenAny; its (empty) result is never inspected
    Task<string> bail = interrupt.ContinueWith((t) => "");
    Task<string> upload = PostRequest(computer, directory + "?op=create", Encoding.UTF8.GetBytes(commandLine));

    Task<string> completed = await Task.WhenAny(bail, upload);
    if (completed == bail)
    {
        logger.Log("Process " + Id + " abandoned creation due to finishWaiter");
        ToExited(ProcessExitState.ScheduleFailed, DateTime.Now.ToFileTimeUtc(), 1, "Service shut down while scheduling");
        return false;
    }

    // the upload finished first: a null result means success, anything else is the error text
    string error = completed.Result;
    if (error == null)
    {
        logger.Log("Process " + Id + " got remote create process success for " + directory);
        ToCreated(DateTime.Now.ToFileTimeUtc());
        return true;
    }

    logger.Log("Process " + Id + " got remote create process failure " + error);
    ToExited(ProcessExitState.ScheduleFailed, DateTime.Now.ToFileTimeUtc(), 1, error);
    return false;
}
/// <summary>
/// read the process status headers from a status response, record the new version and
/// status, and report any status change to the watcher
/// </summary>
/// <param name="response">response whose X-Dryad-* headers describe the remote process</param>
/// <returns>
/// true if the process is now known to have exited (either it reported Completed, or the
/// headers could not be parsed and the process is assumed dead); false otherwise
/// </returns>
private bool UpdateProcessStatus(IHttpResponse response)
{
    try
    {
        // parse every header up front, so that nothing is recorded if any of them is malformed
        UInt64 reportedVersion = UInt64.Parse(response.Headers["X-Dryad-ValueVersion"]);
        string reportedStatus = response.Headers["X-Dryad-ProcessStatus"];
        int exitCode = Int32.Parse(response.Headers["X-Dryad-ProcessExitCode"]);
        long startTime = Int64.Parse(response.Headers["X-Dryad-ProcessStartTime"]);
        long endTime = Int64.Parse(response.Headers["X-Dryad-ProcessEndTime"]);

        statusVersion = reportedVersion;

        if (reportedStatus == statusString)
        {
            // nothing changed since the last time we looked
            return false;
        }

        statusString = reportedStatus;

        switch (reportedStatus)
        {
            case "Queued":
                // don't bother to record this 'transition'
                logger.Log("Process " + Id + " got Queued status");
                return false;

            case "Running":
                logger.Log("Process " + Id + " got Running status");
                ToStarted(startTime);
                return false;

            case "Completed":
                logger.Log("Process " + Id + " got Completed status");
                ToExited(ProcessExitState.ProcessExited, endTime, exitCode, "Process exit detected normally");
                return true;

            default:
                logger.Log("Process " + Id + " got unknown status " + reportedStatus);
                return false;
        }
    }
    catch (Exception e)
    {
        logger.Log("Process " + Id + " got exception " + e.ToString() + " parsing status");
        // we failed to read the headers correctly, which is odd, but we'll assume the process is now dead
        ToExited(ProcessExitState.StatusFailed, DateTime.Now.ToFileTimeUtc(), 1, "Failed to read headers " + e.Message);
        return true;
    }
}
/// <summary>
/// issue one long-polling status request for the process and apply the result
/// </summary>
/// <param name="computer">computer whose process server is queried</param>
/// <param name="interrupt">task whose completion abandons the request</param>
/// <returns>
/// true if the process has been moved to Exited (it completed, the request was
/// interrupted, or the request failed); false if it should be polled again
/// </returns>
private async Task<bool> GetStatus(IComputer computer, Task interrupt)
{
    logger.Log("Process " + Id + " getting status on computer " + computer.Name + " at " + computer.Host);

    // use a 2 minute heartbeat for now
    int timeout = 120000;

    StringBuilder sb = new StringBuilder(directory);
    sb.AppendFormat("?version={0}", statusVersion);
    sb.AppendFormat("&timeout={0}", timeout);

    IHttpResponse fetched;
    try
    {
        IHttpRequest request = HttpClient.Create(computer.ProcessServer + sb.ToString());
        // give the request a little longer than the timeout we asked the server to use
        request.Timeout = timeout + 30000;

        Task<IHttpResponse> bail = interrupt.ContinueWith((t) => null as IHttpResponse);

        Task<IHttpResponse> completed = await Task.WhenAny(bail, request.GetResponseAsync());
        if (completed == bail)
        {
            logger.Log("Process " + Id + " abandoned status due to finishWaiter");
            ToExited(ProcessExitState.StatusFailed, DateTime.Now.ToFileTimeUtc(), 1, "Service stopped while waiting for status");
            return true;
        }

        // WhenAny does not throw when the winning task has faulted, so observe the outcome
        // here, inside the try: a failed request is then handled by the catch blocks below
        // instead of escaping from this method as an AggregateException
        fetched = await completed;
    }
    catch (NotHttpException e)
    {
        string error = "Status fetch failed message " + e.Message + " status " + e.Response.StatusCode + ": " + e.Response.StatusDescription;
        logger.Log("Process " + Id + " got remote process status failure " + error);
        ToExited(ProcessExitState.StatusFailed, DateTime.Now.ToFileTimeUtc(), 1, error);
        return true;
    }
    catch (Exception e)
    {
        string error = "Status fetch failed message " + e.Message;
        logger.Log("Process " + Id + " got remote process status failure " + error);
        ToExited(ProcessExitState.StatusFailed, DateTime.Now.ToFileTimeUtc(), 1, error);
        return true;
    }

    using (IHttpResponse response = fetched)
    {
        try
        {
            // the payload is expected to be empty: open and dispose its stream
            // to keep the protocol happy
            using (Stream payloadStream = response.GetResponseStream())
            {
            }
        }
        catch (NotHttpException e)
        {
            string error = "Status fetch failed message " + e.Message + " status " + e.Response.StatusCode + ": " + e.Response.StatusDescription;
            logger.Log("Process " + Id + " got remote process status failure " + error);
            ToExited(ProcessExitState.StatusFailed, DateTime.Now.ToFileTimeUtc(), 1, error);
            return true;
        }
        catch (Exception e)
        {
            string error = "Status fetch failed message " + e.Message;
            logger.Log("Process " + Id + " got remote process status failure " + error);
            ToExited(ProcessExitState.StatusFailed, DateTime.Now.ToFileTimeUtc(), 1, error);
            return true;
        }

        return UpdateProcessStatus(response);
    }
}
/// <summary>
/// ask the computer's process server to kill the process. This is best effort: the
/// outcome is logged, and nothing further is done if the request fails.
/// </summary>
/// <param name="computer">computer the kill request is sent to</param>
/// <param name="interrupt">task whose completion abandons the kill request</param>
public async Task Kill(IComputer computer, Task interrupt)
{
    logger.Log("Process " + Id + " sending remote kill to computer " + computer.Name + " on host " + computer.Host);

    // bail has the same result type as the upload so that the two can be raced with WhenAny
    Task<string> bail = interrupt.ContinueWith((t) => "");
    Task<string> upload = PostRequest(computer, directory + "?op=kill", new byte[0]);

    Task<string> completed = await Task.WhenAny(bail, upload);
    if (completed == bail)
    {
        logger.Log("Process " + Id + " abandoned kill due to finishWaiter");
        return;
    }

    // a null result from the post means success, anything else is the error text
    string error = completed.Result;
    if (error == null)
    {
        logger.Log("Process " + Id + " got successful response for kill");
    }
    else
    {
        // if this failed, there's nothing much more we can do
        logger.Log("Process " + Id + " got failure response for kill " + error);
    }
}
/// <summary>
/// a task that completes after the cancel task has completed. Every read of this
/// property attaches a new, empty continuation to the cancel task and returns it.
/// </summary>
private Task AsyncCancelTask
{
    get
    {
        Task canceled = cancelTask.Task;
        return canceled.ContinueWith((t) => { });
    }
}
/// <summary>
/// drive the process through its whole lifecycle on the given computer: create it
/// remotely, poll its status until it has exited, and finally send a kill request
/// </summary>
/// <param name="computer">computer the process has been matched to</param>
/// <param name="processId">id of the process on that computer; its string form becomes the process directory</param>
/// <param name="computerInterrupt">task that is started when the owning computer is shutting down</param>
/// <param name="errorReason">if non-null, the process is reported as ScheduleFailed with this text and nothing else is done</param>
public async Task Run(IComputer computer, int processId, Task computerInterrupt, string errorReason)
{
    if (errorReason != null)
    {
        ToExited(ProcessExitState.ScheduleFailed, DateTime.Now.ToFileTimeUtc(), 1, errorReason);
        return;
    }

    // get a unique id for this process on this computer, and store the identifying
    // suffix that we will use to refer to it
    SetComputer(computer, computerInterrupt, processId.ToString());

    logger.Log("Process " + Id + " matched to computer " + computer.Name + " on " + computer.Host);

    // scheduling and status polling stop on either a computer shutdown or a cancellation
    Task interrupt = Task.WhenAny(computerInterrupt, AsyncCancelTask);

    if (await Schedule(computer, interrupt))
    {
        bool exited;
        do
        {
            logger.Log("Process " + Id + " getting status from " + computer.Name + " on " + computer.Host);
            exited = await GetStatus(computer, interrupt);
        }
        while (!exited);
    }

    logger.Log("Process " + Id + " ensuring it is killed at " + computer.Name + " on " + computer.Host);

    // we shouldn't get here until the process has exited unless we got a cancellation, but just for belt
    // and braces we'll always try to make sure it's really dead at the other end
    await Kill(computer, computerInterrupt);

    logger.Log("Process " + Id + " finished running at " + computer.Name + " on " + computer.Host);
}
/// <summary>
/// request cancellation of the process by completing the cancel task. Calling this
/// more than once is harmless.
/// </summary>
public void Cancel()
{
    // TrySetResult rather than SetResult: SetResult throws InvalidOperationException when
    // the task has already been completed, which would make a repeated Cancel() fail
    cancelTask.TrySetResult(true);
}
/// <summary>
/// fetch the value of a key from the remote process and deliver the outcome through
/// status.OnCompleted. If the fetch is interrupted by a computer shutdown or a
/// cancellation it is abandoned, and OnCompleted is not called.
/// </summary>
/// <param name="status">supplies the key, version and timeout of the request, and receives its result</param>
public async Task GetKeyStatus(IProcessKeyStatus status)
{
    logger.Log("Process " + Id + " sending key/value fetch for " +
               status.GetKey() + ":" + status.GetVersion() + ":" + status.GetTimeout());

    IComputer remote;
    Task computerInterrupt;
    string suffix;
    lock (this)
    {
        // use a lock to ensure memory safety since these are set on another thread;
        // directory is written under the same lock, so it is read here as well
        remote = computer;
        computerInterrupt = computerCancellation;
        suffix = directory;
    }

    Task interrupt = Task.WhenAny(computerInterrupt, AsyncCancelTask);

    StringBuilder sb = new StringBuilder(suffix);
    sb.AppendFormat("?key={0}", status.GetKey());
    sb.AppendFormat("&timeout={0}", status.GetTimeout());
    sb.AppendFormat("&version={0}", status.GetVersion());

    IHttpResponse fetched;
    try
    {
        IHttpRequest request = HttpClient.Create(remote.ProcessServer + sb.ToString());
        // give the request a little longer than the timeout we asked the server to use
        request.Timeout = status.GetTimeout() + 30000;

        Task<IHttpResponse> bail = interrupt.ContinueWith((t) => null as IHttpResponse);

        Task<IHttpResponse> completed = await Task.WhenAny(bail, request.GetResponseAsync());
        if (completed == bail)
        {
            logger.Log("Process " + Id + " abandoned property fetch due to interrupt");
            return;
        }

        // WhenAny does not throw when the winning task has faulted, so observe the outcome
        // here, inside the try: a failed request is then reported through OnCompleted by the
        // catch blocks below instead of escaping from this method as an AggregateException
        fetched = await completed;
    }
    catch (NotHttpException e)
    {
        string error = "Status fetch failed message " + e.Message + " status " + e.Response.StatusCode + ": " + e.Response.StatusDescription;
        logger.Log("Process " + Id + " got remote property fetch failure from " + remote.Name +
                   " at " + remote.Host + ": " + error);
        status.OnCompleted(0, null, 1, error);
        return;
    }
    catch (Exception e)
    {
        string error = "Status fetch failed message " + e.ToString();
        logger.Log("Process " + Id + " got remote property fetch failure from " + remote.Name +
                   " at " + remote.Host + ": " + error);
        status.OnCompleted(0, null, 1, error);
        return;
    }

    using (IHttpResponse response = fetched)
    {
        try
        {
            using (MemoryStream ms = new MemoryStream())
            {
                Task payload;
                using (Stream payloadStream = response.GetResponseStream())
                {
                    payload = await Task.WhenAny(interrupt, payloadStream.CopyToAsync(ms));
                }

                if (payload == interrupt)
                {
                    logger.Log("Process " + Id + " abandoned property fetch due to interrupt");
                    return;
                }

                // the copy finished first; await it so that a failed copy is reported as an
                // error below rather than being delivered as a (truncated) successful value
                await payload;

                logger.Log("Process " + Id + " completed property fetch");

                UInt64 newVersion = UInt64.Parse(response.Headers["X-Dryad-ValueVersion"]);
                string stateString = response.Headers["X-Dryad-ProcessStatus"];
                int exitCode = Int32.Parse(response.Headers["X-Dryad-ProcessExitCode"]);

                logger.Log("Process " + Id + " property fetch: " + status.GetKey() + ":" + newVersion + ":" + stateString);

                status.OnCompleted(newVersion, ms.ToArray(), exitCode, null);
            }
        }
        catch (NotHttpException e)
        {
            string error = "Status fetch failed message " + e.Message + " status " + e.Response.StatusCode + ": " + e.Response.StatusDescription;
            logger.Log("Process " + Id + " got remote property fetch failure from " + remote.Name +
                       " at " + remote.Host + ": " + error);
            status.OnCompleted(0, null, 1, error);
        }
        catch (Exception e)
        {
            string error = "Header fetch failed message " + e.ToString() + " headers " + response.Headers;
            logger.Log("Process " + Id + " got remote property fetch failure from " + remote.Name +
                       " at " + remote.Host + ": " + error);
            status.OnCompleted(0, null, 1, error);
        }
    }
}
/// <summary>
/// send a command (a key with a short status and a payload) to the remote process and
/// deliver the outcome through command.OnCompleted. If the send is interrupted by a
/// computer shutdown or a cancellation it is abandoned, and OnCompleted is not called.
/// </summary>
/// <param name="command">supplies the key, short status and payload to send, and receives the result</param>
public async Task SetCommand(IProcessCommand command)
{
    logger.Log("Process " + Id + " sending property command for " +
               command.GetKey() + ":" + command.GetShortStatus());

    IComputer remote;
    Task computerInterrupt;
    string suffix;
    lock (this)
    {
        // use a lock to ensure memory safety since these are set on another thread;
        // directory is written under the same lock, so it is read here as well
        remote = computer;
        computerInterrupt = computerCancellation;
        suffix = directory;
    }

    Task interrupt = Task.WhenAny(computerInterrupt, AsyncCancelTask);

    StringBuilder sb = new StringBuilder(suffix);
    sb.Append("?op=setstatus");
    sb.AppendFormat("&key={0}", command.GetKey());
    sb.AppendFormat("&shortstatus={0}", command.GetShortStatus());
    // the '&' separator is required here: without it this parameter is fused onto the
    // shortstatus value instead of being sent as a separate query parameter
    sb.Append("&notifywaiters=true");

    // bail has the same result type as the upload so that the two can be raced with WhenAny
    Task<string> bail = interrupt.ContinueWith((t) => "");
    Task<string> upload = PostRequest(remote, sb.ToString(), command.GetPayload());

    Task<string> completed = await Task.WhenAny(bail, upload);
    if (completed == bail)
    {
        logger.Log("Process " + Id + " abandoned property command due to interrupt");
        return;
    }

    // a null result from the post means success, anything else is the error text
    string error = completed.Result;
    if (error == null)
    {
        logger.Log("Process " + Id + " send property command succeeded");
        command.OnCompleted(null);
    }
    else
    {
        // if this failed, there's nothing much more we can do
        logger.Log("Process " + Id + " got command send failure " + error);
        command.OnCompleted(error);
    }
}
/// <summary>
/// record that the process has been queued, and notify the watcher
/// </summary>
public void ToQueued()
{
lock (this)
{
// the expected predecessor is Initializing; Debug.Assert only checks this in debug builds
Debug.Assert(state == State.Initializing);
state = State.Queued;
}
// the watcher is notified after the lock has been released
watcher.OnQueued();
}
/// <summary>
/// record that the process has been matched to a computer, and notify the watcher
/// </summary>
/// <param name="computer">computer the process was matched to; passed on to the watcher</param>
/// <param name="timestamp">time of the transition; passed on to the watcher</param>
private void ToMatched(IComputer computer, long timestamp)
{
lock (this)
{
// the expected predecessor is Queued; Debug.Assert only checks this in debug builds
Debug.Assert(state == State.Queued);
state = State.Matched;
}
// the watcher is notified after the lock has been released
watcher.OnMatched(computer, timestamp);
}
/// <summary>
/// record that the process has been created, and notify the watcher
/// </summary>
/// <param name="timestamp">time of the transition; passed on to the watcher</param>
private void ToCreated(long timestamp)
{
lock (this)
{
// the expected predecessor is Matched; Debug.Assert only checks this in debug builds
Debug.Assert(state == State.Matched);
state = State.Created;
}
// the watcher is notified after the lock has been released
watcher.OnCreated(timestamp);
}
/// <summary>
/// record that the process has started running, and notify the watcher
/// </summary>
/// <param name="timestamp">time of the transition; passed on to the watcher</param>
private void ToStarted(long timestamp)
{
lock (this)
{
// the expected predecessor is Created; Debug.Assert only checks this in debug builds
Debug.Assert(state == State.Created);
state = State.Started;
}
// the watcher is notified after the lock has been released
watcher.OnStarted(timestamp);
}
/// <summary>
/// record that the process has exited, and notify the watcher. Only the first call has
/// any effect: later calls find the state already Exited and are ignored.
/// </summary>
/// <param name="exitState">why the process is considered to have exited</param>
/// <param name="timestamp">time of the exit; passed on to the watcher</param>
/// <param name="exitCode">exit code to report to the watcher</param>
/// <param name="errorText">description to report to the watcher</param>
private void ToExited(ProcessExitState exitState, long timestamp, int exitCode, string errorText)
{
    bool firstExit;
    lock (this)
    {
        // this can be reached from any preceding state
        firstExit = (state != State.Exited);
        if (firstExit)
        {
            state = State.Exited;
        }
    }

    if (firstExit)
    {
        // the watcher is notified outside the lock; a duplicate exit is ignored
        watcher.OnExited(exitState, timestamp, exitCode, errorText);
    }
}
}
}