643 lines
27 KiB
C#
643 lines
27 KiB
C#
|
|
/*
|
|
Copyright (c) Microsoft Corporation
|
|
|
|
All rights reserved.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
|
|
compliance with the License. You may obtain a copy of the License
|
|
at http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
|
THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER
|
|
EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED WARRANTIES OR CONDITIONS OF
|
|
TITLE, FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABLITY OR NON-INFRINGEMENT.
|
|
|
|
|
|
See the Apache Version 2.0 License for specific language governing permissions and
|
|
limitations under the License.
|
|
|
|
*/
|
|
|
|
using System;
|
|
using System.Collections.Generic;
|
|
using System.IO;
|
|
using System.Linq;
|
|
using System.Text;
|
|
using System.Threading.Tasks;
|
|
using Microsoft.Research.Calypso.Tools;
|
|
|
|
namespace Microsoft.Research.Calypso.JobObjectModel
|
|
{
|
|
/// <summary>
/// Dynamic information of all the jobs and machines in a cluster.
/// </summary>
public abstract class ClusterStatus
{
    /// <summary>
    /// The static configuration of the cluster.
    /// </summary>
    public ClusterConfiguration Config { get; protected set; }

    /// <summary>
    /// Cache of the jobs on the cluster; the index is the cluster job id.
    /// Starts out null; the implementations of RecomputeClusterJobList are expected to assign it.
    /// </summary>
    protected Dictionary<string, ClusterJobInformation> clusterJobs;

    /// <summary>
    /// For each machine the pod it belongs to.  Loaded lazily from <see cref="QueryClusterTopology"/>.
    /// </summary>
    protected Dictionary<string, string> pods;

    /// <summary>
    /// Create a cluster status and register it in the static per-cluster cache,
    /// replacing any status previously registered under the same cluster name.
    /// </summary>
    /// <param name="config">Cluster configuration.</param>
    /// <exception cref="ArgumentNullException">If <paramref name="config"/> is null.</exception>
    protected ClusterStatus(ClusterConfiguration config)
    {
        if (config == null)
            throw new ArgumentNullException("config");

        this.Config = config;
        this.Initialize();
        // The indexer setter adds the entry or overwrites an existing one, so no ContainsKey check is needed.
        ClusterStatuses[config.Name] = this;
    }

    /// <summary>
    /// Cache for each cluster its status; the index is the cluster name.
    /// </summary>
    private static readonly Dictionary<string, ClusterStatus> ClusterStatuses = new Dictionary<string, ClusterStatus>();

    /// <summary>
    /// See if a status is already cached.
    /// </summary>
    /// <param name="clusterName">Name of cluster.</param>
    /// <returns>The cached status, or null if no status was registered for this cluster name.</returns>
    public static ClusterStatus LookupStatus(string clusterName)
    {
        ClusterStatus status;
        return ClusterStatuses.TryGetValue(clusterName, out status) ? status : null;
    }

    /// <summary>
    /// Initialize the cluster status: the topology is marked as not yet loaded.
    /// </summary>
    private void Initialize()
    {
        this.pods = null;
    }

    /// <summary>
    /// Return the cluster topology, querying it on first use and caching the result in <see cref="pods"/>.
    /// </summary>
    /// <returns>The map from machine to pod.</returns>
    private Dictionary<string, string> GetTopology()
    {
        if (this.pods == null)
            this.pods = this.QueryClusterTopology();
        return this.pods;
    }

    /// <summary>
    /// The cluster map: for each machine the pod it resides in.
    /// </summary>
    /// <returns>An iterator over the cluster map.</returns>
    public IEnumerable<KeyValuePair<string, string>> Pods()
    {
        return this.GetTopology().AsEnumerable();
    }

    /// <summary>
    /// Discover the way machines are organized into pods.
    /// </summary>
    /// <returns>A map whose key is the machine and whose value is the pod the machine belongs to.</returns>
    protected abstract Dictionary<string, string> QueryClusterTopology();

    /// <summary>
    /// Generate the list of machines which may be in the cluster.
    /// </summary>
    /// <returns>An iterator returning all machines in the cluster.</returns>
    public IEnumerable<string> GetClusterMachines()
    {
        return this.GetTopology().Select(m => m.Key);
    }

    /// <summary>
    /// Recompute the list of jobs on the cluster and return a snapshot of it.
    /// </summary>
    /// <param name="virtualCluster">Virtual cluster selected; defined only for Scope clusters.</param>
    /// <param name="reporter">Delegate used to report errors.</param>
    /// <param name="reportProgress">Used to report progress.</param>
    /// <returns>A copy of the job list, detached from the internal cache.</returns>
    public IEnumerable<ClusterJobInformation> GetClusterJobList(string virtualCluster, StatusReporter reporter, Action<int> reportProgress)
    {
        this.RecomputeClusterJobList(virtualCluster, reporter, reportProgress);
        return this.clusterJobs.Values.ToList();
    }

    /// <summary>
    /// Force the recomputation of the cluster job list.
    /// </summary>
    /// <param name="reporter">Delegate used to report errors.</param>
    /// <param name="virtualCluster">Virtual cluster to use (defined only for some cluster types).</param>
    /// <param name="reportProgress">Used to report progress.</param>
    // ReSharper disable once UnusedParameter.Global
    protected abstract void RecomputeClusterJobList(string virtualCluster, StatusReporter reporter, Action<int> reportProgress);

    /// <summary>
    /// Discover the (unique) dryadlinq job corresponding to a cluster job.
    /// </summary>
    /// <param name="clusterJob">Cluster Job.</param>
    /// <returns>The job description.</returns>
    /// <param name="reporter">Delegate used to report errors.</param>
    public abstract DryadLinqJobSummary DiscoverDryadLinqJobFromClusterJob(ClusterJobInformation clusterJob, StatusReporter reporter);

    /// <summary>
    /// Discover the dryadlinq job given a url from the cluster scheduler.
    /// (Does not make sense for some cluster architectures, e.g., HPC.)
    /// </summary>
    /// <param name="url">URL pointing to the job.</param>
    /// <returns>The dryadlinq job summary.</returns>
    /// <param name="reporter">Delegate used to report errors.</param>
    // ReSharper disable UnusedParameter.Global
    public abstract DryadLinqJobSummary DiscoverDryadLinqJobFromURL(string url, StatusReporter reporter);
    // ReSharper restore UnusedParameter.Global

    /// <summary>
    /// Discover a cluster job given its id.  The job list is computed first if it has never been computed.
    /// </summary>
    /// <param name="job">Job to discover.</param>
    /// <returns>The cluster job, or null if not found.</returns>
    /// <param name="reporter">Delegate used to report errors.</param>
    /// <param name="reportProgress">Used to report progress.</param>
    public virtual ClusterJobInformation DiscoverClusterJob(DryadLinqJobSummary job, StatusReporter reporter, Action<int> reportProgress)
    {
        if (this.clusterJobs == null)
            this.RecomputeClusterJobList(job.VirtualCluster, reporter, reportProgress);

        // A missing job must yield null (as documented, and as RefreshStatus expects);
        // the dictionary indexer would throw KeyNotFoundException instead.
        ClusterJobInformation info;
        if (this.clusterJobs != null && this.clusterJobs.TryGetValue(job.ClusterJobId, out info))
            return info;
        return null;
    }

    /// <summary>
    /// Refresh the job summary status.  The status becomes Unknown when the job cannot be found.
    /// </summary>
    /// <param name="summary">Summary to refresh.</param>
    /// <param name="reporter">Delegate used to report errors.</param>
    /// <param name="reportProgress">Used to report progress.</param>
    public virtual void RefreshStatus(DryadLinqJobSummary summary, StatusReporter reporter, Action<int> reportProgress)
    {
        // refresh the whole list
        this.RecomputeClusterJobList(summary.VirtualCluster, reporter, reportProgress);
        ClusterJobInformation info = this.DiscoverClusterJob(summary, reporter, reportProgress);
        if (info == null)
        {
            summary.Status = ClusterJobInformation.ClusterJobStatus.Unknown;
            return;
        }
        summary.Status = info.Status;
    }

    /// <summary>
    /// Cancel the specified job.
    /// </summary>
    /// <param name="job">Job whose execution is cancelled.</param>
    /// <returns>True if the cancellation succeeded.</returns>
    public abstract bool CancelJob(DryadLinqJobSummary job);
}
|
|
|
|
|
|
|
|
/// <summary>
/// A fake cluster keeps some information about past jobs on a local filesystem, to allow post-mortem debugging.
/// </summary>
public class CacheClusterStatus : ClusterStatus
{
    /// <summary>
    /// Create a fake cluster status.
    /// </summary>
    /// <param name="config">Configuration to use for this cluster; must be a CacheClusterConfiguration.</param>
    /// <exception cref="ArgumentException">If the configuration is not for a cache cluster.</exception>
    public CacheClusterStatus(ClusterConfiguration config)
        : base(CheckConfiguration(config))
    {
    }

    /// <summary>
    /// Validate the configuration before the base constructor runs, so that a status
    /// built from a wrong configuration is never handed to the base class.
    /// </summary>
    /// <param name="config">Configuration to validate.</param>
    /// <returns>The same configuration, if valid.</returns>
    private static ClusterConfiguration CheckConfiguration(ClusterConfiguration config)
    {
        if (!(config is CacheClusterConfiguration))
            throw new ArgumentException("Expected configuration to be for a cache cluster");
        return config;
    }

    /// <summary>
    /// Create the status of the actual cluster a cached job ran on.
    /// </summary>
    /// <param name="job">Cached job.</param>
    /// <returns>A status object for the cluster described by the job's actual configuration.</returns>
    private ClusterStatus CreateActualStatus(DryadLinqJobSummary job)
    {
        // Direct cast: a wrong configuration type fails here with a clear InvalidCastException
        // rather than with a NullReferenceException on the next call.
        CacheClusterConfiguration cacheConfig = (CacheClusterConfiguration)this.Config;
        ClusterConfiguration actual = cacheConfig.ActualConfig(job);
        return actual.CreateClusterStatus();
    }

    /// <summary>
    /// Not implemented; return an empty topology.
    /// </summary>
    /// <returns>An empty map.</returns>
    protected override Dictionary<string, string> QueryClusterTopology()
    {
        return new Dictionary<string, string>();
    }

    /// <summary>
    /// Recompute the list of jobs on the cluster and add them to the clusterJobs field.
    /// Jobs are loaded from the "*.xml" files in the "jobs" subfolder of the cache directory;
    /// the subfolder is created if it does not exist.
    /// </summary>
    /// <param name="reporter">Delegate used to report errors.</param>
    /// <param name="virtualCluster">Unused.</param>
    /// <param name="reportProgress">Used to report progress.</param>
    protected override void RecomputeClusterJobList(string virtualCluster, StatusReporter reporter, Action<int> reportProgress)
    {
        this.clusterJobs = new Dictionary<string, ClusterJobInformation>();
        if (string.IsNullOrEmpty(CachedClusterResidentObject.CacheDirectory))
        {
            // Nothing to scan; still signal completion like the normal path does.
            reportProgress(100);
            return;
        }

        string joblist = Path.Combine(CachedClusterResidentObject.CacheDirectory, "jobs");
        // CreateDirectory does nothing when the directory already exists.
        Directory.CreateDirectory(joblist);

        foreach (string file in Directory.GetFiles(joblist, "*.xml"))
        {
            DryadLinqJobSummary job = Utilities.LoadXml<DryadLinqJobSummary>(file);
            string cjid = job.Cluster + "-" + job.ClusterJobId; // there may be two jobs with same id from different clusters
            if (this.clusterJobs.ContainsKey(cjid))
            {
                reporter("Duplicate job id, cannot insert in cache " + job.AsIdentifyingString(), StatusKind.Error);
                continue;
            }

            ClusterJobInformation ci = new ClusterJobInformation(this.Config.Name, job.Cluster, cjid, job.Name, job.User, job.Date, job.EndTime - job.Date, job.Status);
            ci.SetAssociatedSummary(job);
            this.clusterJobs.Add(cjid, ci);
        }
        reportProgress(100);
    }

    /// <summary>
    /// Refresh the job summary status by asking the actual cluster the job ran on.
    /// The status becomes Unknown when the actual cluster does not find the job.
    /// </summary>
    /// <param name="job">Summary to refresh.</param>
    /// <param name="reporter">Delegate used to report errors.</param>
    /// <param name="reportProgress">Used to report progress.</param>
    public override void RefreshStatus(DryadLinqJobSummary job, StatusReporter reporter, Action<int> reportProgress)
    {
        ClusterStatus actualStatus = this.CreateActualStatus(job);
        actualStatus.RefreshStatus(job, reporter, reportProgress);
        ClusterJobInformation info = actualStatus.DiscoverClusterJob(job, reporter, reportProgress);
        if (info == null)
        {
            job.Status = ClusterJobInformation.ClusterJobStatus.Unknown;
            return;
        }
        job.Status = info.Status;
    }

    /// <summary>
    /// Not needed, all summaries are already known.
    /// </summary>
    /// <param name="clusterJob">Cluster job information.</param>
    /// <param name="reporter">Delegate used to report errors.</param>
    /// <returns>Throws an exception.</returns>
    /// <exception cref="InvalidOperationException">Always.</exception>
    public override DryadLinqJobSummary DiscoverDryadLinqJobFromClusterJob(ClusterJobInformation clusterJob, StatusReporter reporter)
    {
        throw new InvalidOperationException();
    }

    /// <summary>
    /// This functionality is not available for cached jobs.
    /// </summary>
    /// <param name="url">Job url.</param>
    /// <param name="reporter">Delegate used to report errors.</param>
    /// <returns>Throws an exception.</returns>
    /// <exception cref="InvalidOperationException">Always.</exception>
    public override DryadLinqJobSummary DiscoverDryadLinqJobFromURL(string url, StatusReporter reporter)
    {
        throw new InvalidOperationException();
    }

    /// <summary>
    /// Discover a cluster job by delegating to the actual cluster the cached job ran on.
    /// </summary>
    /// <param name="job">Cluster job.</param>
    /// <returns>Whatever the actual cluster's DiscoverClusterJob returns for this job.</returns>
    /// <param name="reporter">Delegate used to report errors.</param>
    /// <param name="reportProgress">Used to report progress.</param>
    public override ClusterJobInformation DiscoverClusterJob(DryadLinqJobSummary job, StatusReporter reporter, Action<int> reportProgress)
    {
        return this.CreateActualStatus(job).DiscoverClusterJob(job, reporter, reportProgress);
    }

    /// <summary>
    /// Not needed, these jobs are no longer running.
    /// </summary>
    /// <param name="job">Job to cancel.</param>
    /// <returns>Throws an exception.</returns>
    /// <exception cref="InvalidOperationException">Always.</exception>
    public override bool CancelJob(DryadLinqJobSummary job)
    {
        throw new InvalidOperationException();
    }
}
|
|
|
|
#region IDisposable Members
|
|
#endregion
|
|
|
|
/// <summary>
/// Status of a cluster comprised only of the local machine.
/// </summary>
public class YarnEmulatedClusterStatus : ClusterStatus
{
    /// <summary>
    /// The cluster configuration, typed as the local emulator configuration.
    /// </summary>
    private readonly LocalEmulator config;

    /// <summary>
    /// Create a cluster containing just the local machine.
    /// </summary>
    /// <param name="config">Configuration for the local machine; must be a LocalEmulator.</param>
    /// <exception cref="ArgumentNullException">If the configuration is null.</exception>
    /// <exception cref="ArgumentException">If the configuration is not a LocalEmulator.</exception>
    public YarnEmulatedClusterStatus(ClusterConfiguration config)
        : base(CheckConfiguration(config))
    {
        this.config = (LocalEmulator)config;
    }

    /// <summary>
    /// Validate the configuration before the base constructor runs, so that a status
    /// built from a wrong configuration is never handed to the base class.
    /// </summary>
    /// <param name="config">Configuration to validate.</param>
    /// <returns>The same configuration, if valid.</returns>
    private static ClusterConfiguration CheckConfiguration(ClusterConfiguration config)
    {
        if (config == null)
            throw new ArgumentNullException("config");
        if (!(config is LocalEmulator))
            throw new ArgumentException("Expected a LocalEmulator configuration, got a " + config.GetType());
        return config;
    }

    /// <summary>
    /// The topology of the emulated cluster: a single machine "localhost" in pod "localhost".
    /// </summary>
    /// <returns>A map with the single entry localhost -> localhost.</returns>
    protected override Dictionary<string, string> QueryClusterTopology()
    {
        return new Dictionary<string, string> { { "localhost", "localhost" } };
    }

    /// <summary>
    /// Force the recomputation of the cluster job list.
    /// Each subfolder of the configured jobs folder is treated as a job; the folder name is the job id.
    /// </summary>
    /// <param name="reporter">Delegate used to report errors.</param>
    /// <param name="virtualCluster">Virtual cluster to use (defined only for some cluster types).</param>
    /// <param name="reportProgress">Used to report progress.</param>
    protected override void RecomputeClusterJobList(string virtualCluster, StatusReporter reporter, Action<int> reportProgress)
    {
        this.clusterJobs = new Dictionary<string, ClusterJobInformation>();
        if (!Directory.Exists(this.config.JobsFolder))
        {
            // Nothing to scan; still signal completion like the normal path does.
            reportProgress(100);
            return;
        }

        string[] subfolders = Directory.GetDirectories(this.config.JobsFolder);
        int done = 0;
        foreach (string job in subfolders)
        {
            string jobId = Path.GetFileName(job);
            ClusterJobInformation info = this.GetJobInfo(job, jobId);
            if (info != null)
            {
                // ReSharper disable once AssignNullToNotNullAttribute
                this.clusterJobs.Add(jobId, info);
            }
            reportProgress(done++ * 100 / subfolders.Length);
        }
        reportProgress(100);
    }

    /// <summary>
    /// Discover the (unique) dryadlinq job corresponding to a cluster job.
    /// The cluster job id is used both as the job id and as the cluster job id of the summary.
    /// </summary>
    /// <param name="clusterJob">Cluster Job.</param>
    /// <returns>The job description.</returns>
    /// <param name="reporter">Delegate used to report errors.</param>
    public override DryadLinqJobSummary DiscoverDryadLinqJobFromClusterJob(ClusterJobInformation clusterJob, StatusReporter reporter)
    {
        return new DryadLinqJobSummary(
            clusterJob.Cluster,
            this.config.TypeOfCluster,
            "", // virtual cluster
            "", // machine
            clusterJob.ClusterJobID, // jobId
            clusterJob.ClusterJobID, // clusterJobId
            new DryadProcessIdentifier("jm"), // jmProcessGuid
            clusterJob.Name,
            clusterJob.User,
            clusterJob.Date,
            clusterJob.Date + clusterJob.EstimatedRunningTime,
            clusterJob.Status);
    }

    /// <summary>
    /// Discover the dryadlinq job given a url from the cluster scheduler.
    /// Not available for the emulated cluster.
    /// </summary>
    /// <param name="url">URL pointing to the job.</param>
    /// <returns>Throws an exception.</returns>
    /// <param name="reporter">Delegate used to report errors.</param>
    /// <exception cref="InvalidOperationException">Always.</exception>
    public override DryadLinqJobSummary DiscoverDryadLinqJobFromURL(string url, StatusReporter reporter)
    {
        throw new InvalidOperationException();
    }

    /// <summary>
    /// Extract the job information from a folder with logs on the local machine.
    /// The job date is the creation time of the "jm" subfolder; the user is the current user,
    /// the running time is zero and the status is Unknown.
    /// </summary>
    /// <param name="jobRootFolder">Folder with logs for the specified job.</param>
    /// <returns>The job information, or null if the folder has no "jm" subfolder.</returns>
    /// <param name="jobId">Job id; also used as the job name.</param>
    private ClusterJobInformation GetJobInfo(string jobRootFolder, string jobId)
    {
        string jmFolder = Path.Combine(jobRootFolder, "jm");
        if (!Directory.Exists(jmFolder))
            return null;

        // jmFolder is a directory, so query it through the Directory API.
        DateTime date = Directory.GetCreationTime(jmFolder);
        return new ClusterJobInformation(this.config.Name, "", jobId, jobId, Environment.UserName, date, TimeSpan.Zero, ClusterJobInformation.ClusterJobStatus.Unknown);
    }

    /// <summary>
    /// Cancel the specified job.  Not available for the emulated cluster.
    /// </summary>
    /// <param name="job">Job whose execution is cancelled.</param>
    /// <returns>Throws an exception.</returns>
    /// <exception cref="InvalidOperationException">Always.</exception>
    public override bool CancelJob(DryadLinqJobSummary job)
    {
        throw new InvalidOperationException();
    }
}
|
|
|
|
/// <summary>
/// Status of an Azure DFS cluster.
/// </summary>
public class AzureDfsClusterStatus : ClusterStatus
{
    /// <summary>
    /// A job reported as running whose last heartbeat is older than this is considered crashed.
    /// </summary>
    private static readonly TimeSpan HeartbeatTimeout = TimeSpan.FromSeconds(40);

    /// <summary>
    /// The cluster configuration, typed as the Azure DFS configuration.
    /// </summary>
    private readonly AzureDfsClusterConfiguration config;

    /// <summary>
    /// Create the status of an Azure DFS cluster.
    /// </summary>
    /// <param name="config">Configuration of the cluster; must be an AzureDfsClusterConfiguration.</param>
    /// <exception cref="ArgumentNullException">If the configuration is null.</exception>
    /// <exception cref="ArgumentException">If the configuration is not an AzureDfsClusterConfiguration.</exception>
    public AzureDfsClusterStatus(ClusterConfiguration config)
        : base(CheckConfiguration(config))
    {
        this.config = (AzureDfsClusterConfiguration)config;
    }

    /// <summary>
    /// Validate the configuration before the base constructor runs, so that a status
    /// built from a wrong configuration is never handed to the base class.
    /// </summary>
    /// <param name="config">Configuration to validate.</param>
    /// <returns>The same configuration, if valid.</returns>
    private static ClusterConfiguration CheckConfiguration(ClusterConfiguration config)
    {
        if (config == null)
            throw new ArgumentNullException("config");
        if (!(config is AzureDfsClusterConfiguration))
            throw new ArgumentException("Expected an AzureDfsClusterConfiguration, got a " + config.GetType());
        return config;
    }

    /// <summary>
    /// The topology is not actually queried: a single entry localhost -> localhost is returned.
    /// </summary>
    /// <returns>A map with the single entry localhost -> localhost.</returns>
    protected override Dictionary<string, string> QueryClusterTopology()
    {
        return new Dictionary<string, string> { { "localhost", "localhost" } };
    }

    /// <summary>
    /// Force the recomputation of the cluster job list.
    /// Each entry enumerated at the root of the Azure container is examined as a potential job folder.
    /// </summary>
    /// <param name="reporter">Delegate used to report errors.</param>
    /// <param name="virtualCluster">Virtual cluster to use (defined only for some cluster types).</param>
    /// <param name="reportProgress">Used to report progress.</param>
    protected override void RecomputeClusterJobList(string virtualCluster, StatusReporter reporter, Action<int> reportProgress)
    {
        this.clusterJobs = new Dictionary<string, ClusterJobInformation>();
        var jobs = this.config.AzureClient.EnumerateDirectory("").ToList();

        int done = 0;
        foreach (var job in jobs)
        {
            ClusterJobInformation info = this.GetJobInfo(job);
            if (info != null)
            {
                // ReSharper disable once AssignNullToNotNullAttribute
                this.clusterJobs.Add(job, info);
            }
            reportProgress(100 * done++ / jobs.Count);
        }
        reportProgress(100);
    }

    /// <summary>
    /// Discover the (unique) dryadlinq job corresponding to a cluster job.
    /// The cluster job id is used both as the job id and as the cluster job id of the summary.
    /// </summary>
    /// <param name="clusterJob">Cluster Job.</param>
    /// <returns>The job description.</returns>
    /// <param name="reporter">Delegate used to report errors.</param>
    public override DryadLinqJobSummary DiscoverDryadLinqJobFromClusterJob(ClusterJobInformation clusterJob, StatusReporter reporter)
    {
        return new DryadLinqJobSummary(
            clusterJob.Cluster,
            this.config.TypeOfCluster,
            "", // virtual cluster
            "", // machine
            clusterJob.ClusterJobID, // jobId
            clusterJob.ClusterJobID, // clusterJobId
            new DryadProcessIdentifier("jm"), // jmProcessGuid
            clusterJob.Name,
            clusterJob.User,
            clusterJob.Date,
            clusterJob.Date + clusterJob.EstimatedRunningTime,
            clusterJob.Status);
    }

    /// <summary>
    /// Discover the dryadlinq job given a url from the cluster scheduler.
    /// Not available for this cluster type.
    /// </summary>
    /// <param name="url">URL pointing to the job.</param>
    /// <returns>Throws an exception.</returns>
    /// <param name="reporter">Delegate used to report errors.</param>
    /// <exception cref="InvalidOperationException">Always.</exception>
    public override DryadLinqJobSummary DiscoverDryadLinqJobFromURL(string url, StatusReporter reporter)
    {
        throw new InvalidOperationException();
    }

    /// <summary>
    /// Translate the "status" metadata value of a heartbeat blob into a job status.
    /// </summary>
    /// <param name="text">Metadata value.</param>
    /// <param name="current">Status to keep when the value is not recognized.</param>
    /// <returns>The parsed status, or <paramref name="current"/> for an unrecognized value.</returns>
    private static ClusterJobInformation.ClusterJobStatus ParseStatus(string text, ClusterJobInformation.ClusterJobStatus current)
    {
        switch (text)
        {
            case "failure":
                return ClusterJobInformation.ClusterJobStatus.Failed;
            case "success":
                return ClusterJobInformation.ClusterJobStatus.Succeeded;
            case "running":
                return ClusterJobInformation.ClusterJobStatus.Running;
            default:
                Console.WriteLine("Unknown status " + text);
                return current;
        }
    }

    /// <summary>
    /// Extract the job information from the blobs stored under a job folder in the Azure container.
    /// The start date is the last-modified time of a blob whose name contains "DryadLinqProgram__";
    /// the status and last heartbeat come from the metadata of a blob whose name ends with "heartbeat".
    /// </summary>
    /// <param name="jobRootFolder">Folder with logs for the specified job; also used as job id and job name.</param>
    /// <returns>The job information, or null if no "DryadLinqProgram__" blob was found.</returns>
    private ClusterJobInformation GetJobInfo(string jobRootFolder)
    {
        DateTime date = DateTime.MinValue;
        DateTime lastHeartBeat = DateTime.MinValue;
        ClusterJobInformation.ClusterJobStatus status = ClusterJobInformation.ClusterJobStatus.Unknown;
        bool found = false;

        var jobsFolders = this.config.AzureClient.EnumerateDirectory(jobRootFolder).ToList();
        foreach (var file in jobsFolders)
        {
            if (file.Contains("DryadLinqProgram__"))
            {
                var blob = this.config.AzureClient.Container.GetBlockBlobReference(file);
                blob.FetchAttributes();
                var props = blob.Properties;
                if (props.LastModified.HasValue)
                {
                    // LocalDateTime converts using the value's own offset.
                    date = props.LastModified.Value.LocalDateTime;
                }
                found = true;
            }
            else if (file.EndsWith("heartbeat", StringComparison.Ordinal))
            {
                var blob = this.config.AzureClient.Container.GetPageBlobReference(file);
                blob.FetchAttributes();
                var props = blob.Metadata;

                // The status must be read before the heartbeat: the staleness check below depends on it.
                if (props.ContainsKey("status"))
                    status = ParseStatus(props["status"], status);

                if (props.ContainsKey("heartbeat"))
                {
                    // On a parse failure TryParse leaves lastHeartBeat at DateTime.MinValue.
                    if (DateTime.TryParse(props["heartbeat"], out lastHeartBeat))
                    {
                        lastHeartBeat = lastHeartBeat.ToLocalTime();
                        if (status == ClusterJobInformation.ClusterJobStatus.Running &&
                            DateTime.Now - lastHeartBeat > HeartbeatTimeout)
                        {
                            // job has in fact crashed
                            status = ClusterJobInformation.ClusterJobStatus.Failed;
                        }
                    }
                }
            }
        }

        if (!found)
            return null;

        // The running time is only known when both the start date and a heartbeat were read.
        TimeSpan running = TimeSpan.Zero;
        if (date != DateTime.MinValue && lastHeartBeat != DateTime.MinValue)
            running = lastHeartBeat - date;
        return new ClusterJobInformation(this.config.Name, "", jobRootFolder, jobRootFolder, Environment.UserName, date, running, status);
    }

    /// <summary>
    /// Refresh the job summary status by re-reading the information of this single job
    /// (the cached job list is not recomputed).  The status becomes Unknown when the job is not found.
    /// </summary>
    /// <param name="summary">Summary to refresh.</param>
    /// <param name="reporter">Delegate used to report errors.</param>
    /// <param name="reportProgress">Used to report progress.</param>
    public override void RefreshStatus(DryadLinqJobSummary summary, StatusReporter reporter, Action<int> reportProgress)
    {
        ClusterJobInformation info = this.GetJobInfo(summary.JobID);
        if (info == null)
        {
            summary.Status = ClusterJobInformation.ClusterJobStatus.Unknown;
            return;
        }
        summary.Status = info.Status;
    }

    /// <summary>
    /// Cancel the specified job.  Not available for this cluster type.
    /// </summary>
    /// <param name="job">Job whose execution is cancelled.</param>
    /// <returns>Throws an exception.</returns>
    /// <exception cref="InvalidOperationException">Always.</exception>
    public override bool CancelJob(DryadLinqJobSummary job)
    {
        throw new InvalidOperationException();
    }
}
|
|
}
|