Dryad/JobBrowser/JobBrowser/Diagnosis.cs

921 lines
36 KiB
C#

/*
Copyright (c) Microsoft Corporation
All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
compliance with the License. You may obtain a copy of the License
at http://www.apache.org/licenses/LICENSE-2.0
THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER
EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED WARRANTIES OR CONDITIONS OF
TITLE, FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABLITY OR NON-INFRINGEMENT.
See the Apache Version 2.0 License for specific language governing permissions and
limitations under the License.
*/
using System;
using System.IO;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
using Microsoft.Research.JobObjectModel;
using Microsoft.Research.Tools;
namespace Microsoft.Research.DryadAnalysis
{
/// <summary>
/// Outcome of a diagnostic check: a yes/no answer extended with
/// "don't know" and "not applicable" values.
/// </summary>
public enum Decision
{
/// <summary>
/// The check answered affirmatively.
/// </summary>
Yes,
/// <summary>
/// The check answered negatively.
/// </summary>
No,
/// <summary>
/// The check could not reach a conclusion.
/// </summary>
Dontknow,
/// <summary>
/// The check is not applicable.
/// </summary>
NA
};
/// <summary>
/// A single message produced while diagnosing a problem in a system.
/// </summary>
public class DiagnosisMessage
{
    /// <summary>
    /// Importance levels for diagnosis messages.
    /// The declaration order matters: values are compared numerically when a log is filtered by a cutoff.
    /// </summary>
    public enum Importance
    {
        /// <summary>
        /// Traces the decision flow.
        /// </summary>
        Tracing,
        /// <summary>
        /// An error occurred during the diagnostic process.
        /// </summary>
        Error,
        /// <summary>
        /// Final diagnostic message.
        /// </summary>
        Final,
        /// <summary>
        /// The problem is a bug in Dryad/DryadLINQ itself.
        /// </summary>
        CoreBug,
    }

    /// <summary>
    /// Create a new diagnosis message.
    /// </summary>
    /// <param name="i">Message importance.</param>
    /// <param name="message">Message text.</param>
    /// <param name="details">Additional details about the message.</param>
    public DiagnosisMessage(Importance i, string message, string details)
    {
        this.Details = details;
        this.Message = message;
        this.MessageImportance = i;
    }

    /// <summary>
    /// Importance of the message.
    /// </summary>
    public Importance MessageImportance { get; protected set; }

    /// <summary>
    /// Text of the message.
    /// </summary>
    public string Message { get; protected set; }

    /// <summary>
    /// Additional details about the message.
    /// </summary>
    public string Details { get; protected set; }

    /// <summary>
    /// String representation of the diagnosis message.
    /// </summary>
    /// <returns>The message text and the details separated by a single space.</returns>
    public override string ToString()
    {
        // A null part contributes an empty string, exactly as a format placeholder would.
        return this.Message + " " + this.Details;
    }
}
/// <summary>
/// A diagnostic log is a sequence of messages.
/// </summary>
public class DiagnosisLog
{
    /// <summary>
    /// List of messages in the log, in the order they were added.
    /// </summary>
    private List<DiagnosisMessage> messages;
    /// <summary>
    /// Summary of job being diagnosed.
    /// </summary>
    public DryadLinqJobSummary Summary { get; protected set; }
    /// <summary>
    /// Job being diagnosed.
    /// </summary>
    public DryadLinqJobInfo Job { get; protected set; }
    /// <summary>
    /// Create a new, empty diagnostic log.
    /// </summary>
    /// <param name="job">Job being diagnosed.</param>
    /// <param name="summary">Summary of the job being diagnosed.</param>
    public DiagnosisLog(DryadLinqJobInfo job, DryadLinqJobSummary summary)
    {
        this.messages = new List<DiagnosisMessage>();
        this.Summary = summary;
        this.Job = job;
    }
    /// <summary>
    /// Add a new message to the log.
    /// </summary>
    /// <param name="msg">Message to add to log.</param>
    public void AddMessage(DiagnosisMessage msg)
    {
        this.messages.Add(msg);
    }
    /// <summary>
    /// The part of the log with high enough severity.
    /// Messages with the same text are reported once (using the first occurrence),
    /// followed by a repeat count; messages are emitted in the order of their first occurrence.
    /// </summary>
    /// <param name="cutoff">Do not show messages below this severity.</param>
    /// <returns>A string representation of all suitable messages in the log.</returns>
    public IEnumerable<string> Message(DiagnosisMessage.Importance cutoff)
    {
        bool coreBugFound = false;
        // Dictionary enumeration order is unspecified, so the first occurrences are kept
        // in a separate list to guarantee that the output preserves the log order.
        List<DiagnosisMessage> firstOccurrences = new List<DiagnosisMessage>();
        Dictionary<string, int> occurrences = new Dictionary<string, int>();
        foreach (DiagnosisMessage candidate in this.messages)
        {
            if (candidate.MessageImportance < cutoff)
                continue;
            if (candidate.MessageImportance == DiagnosisMessage.Importance.CoreBug)
                coreBugFound = true;
            // A null message text cannot be a dictionary key; treat it as empty text.
            string key = candidate.Message ?? string.Empty;
            int seen;
            if (occurrences.TryGetValue(key, out seen))
            {
                occurrences[key] = seen + 1;
            }
            else
            {
                occurrences.Add(key, 1);
                firstOccurrences.Add(candidate);
            }
        }
        foreach (DiagnosisMessage first in firstOccurrences)
        {
            int count = occurrences[first.Message ?? string.Empty];
            string text = first.ToString();
            if (count > 1)
                text += " [repeated " + count + " times]";
            yield return text;
        }
        if (coreBugFound)
        {
            yield return "This is a bug in the underlying system (Dryad/DryadLINQ/Quincy). You can report this diagnosis to the DryadLINQ developers at drylnqin";
        }
    }
    /// <summary>
    /// Default string representation: only messages of importance Final or higher.
    /// </summary>
    /// <returns>A string representation of the log.</returns>
    public IEnumerable<string> Message()
    {
        // ReSharper disable once IntroduceOptionalParameters.Global
        return this.Message(DiagnosisMessage.Importance.Final);
    }
    /// <summary>
    /// String representation of the log contents.
    /// </summary>
    /// <returns>A big string containing each line of the log on a separate line.</returns>
    public override string ToString()
    {
        return string.Join("\n", this.Message().ToArray());
    }
}
/// <summary>
/// Common state and helpers shared by all failure diagnoses.
/// </summary>
public abstract class FailureDiagnosis
{
    /// <summary>
    /// Log that receives the diagnosis messages.
    /// This base class never assigns it; it must be set before <see cref="Log"/> is called.
    /// </summary>
    protected DiagnosisLog diagnosisLog;

    /// <summary>
    /// Cluster where the job resides.
    /// </summary>
    protected readonly ClusterConfiguration cluster;

    /// <summary>
    /// Summary of the job being diagnosed.
    /// </summary>
    public DryadLinqJobSummary Summary { get; protected set; }

    /// <summary>
    /// Job being diagnosed.
    /// </summary>
    public DryadLinqJobInfo Job { get; protected set; }

    /// <summary>
    /// Communication manager.
    /// </summary>
    public CommManager Manager { get; protected set; }

    /// <summary>
    /// Plan of the job.
    /// </summary>
    public DryadJobStaticPlan StaticPlan { get; protected set; }

    /// <summary>
    /// Create a FailureDiagnosis object for a job whose information is already available.
    /// </summary>
    /// <param name="job">Job being diagnosed.</param>
    /// <param name="plan">Static plan of the job.</param>
    /// <param name="manager">Communication manager.</param>
    protected FailureDiagnosis(DryadLinqJobInfo job, DryadJobStaticPlan plan, CommManager manager)
    {
        this.cluster = job.ClusterConfiguration;
        this.Summary = job.Summary;
        this.Manager = manager;
        this.StaticPlan = plan;
        this.Job = job;
    }

    /// <summary>
    /// Create a failure diagnosis when the job info is not yet known;
    /// the job information is looked up from the cluster and the summary.
    /// </summary>
    /// <param name="config">Cluster where job resides.</param>
    /// <param name="summary">Job summary.</param>
    /// <param name="manager">Communication manager.</param>
    protected FailureDiagnosis(ClusterConfiguration config, DryadLinqJobSummary summary, CommManager manager)
    {
        this.Manager = manager;
        this.Summary = summary;
        this.cluster = config;
        // Needs cluster and Summary to be set already.
        this.FindJobInfo(manager);
    }

    /// <summary>
    /// Try to find the job information from cluster and summary.
    /// On success sets Job and StaticPlan; on failure reports an error status and leaves them untouched.
    /// </summary>
    /// <param name="manager">Communication manager.</param>
    protected void FindJobInfo(CommManager manager)
    {
        DryadLinqJobInfo discovered = DryadLinqJobInfo.CreateDryadLinqJobInfo(this.cluster, this.Summary, true, manager);
        if (discovered != null)
        {
            this.Job = discovered;
            this.StaticPlan = JobObjectModel.DryadJobStaticPlan.CreatePlan(discovered, manager);
        }
        else
        {
            manager.Status("Cannot collect information for " + Summary.ShortName() + " to diagnose", StatusKind.Error);
        }
    }

    /// <summary>
    /// Write a log message to the diagnosis log.
    /// </summary>
    /// <param name="importance">Message importance.</param>
    /// <param name="message">Message to write.</param>
    /// <param name="details">Additional message details.</param>
    protected void Log(DiagnosisMessage.Importance importance, string message, string details)
    {
        DiagnosisMessage entry = new DiagnosisMessage(importance, message, details);
        this.diagnosisLog.AddMessage(entry);
    }
}
#region COMMON_DIAGNOSIS
// This diagnosis logic depends mostly on Dryad itself, so it is independent of the cluster platform.
/// <summary>
/// Diagnoses the failure of a vertex.
/// </summary>
public class VertexFailureDiagnosis : FailureDiagnosis
{
/// <summary>
/// Vertex instance whose failure is being diagnosed; assigned by the constructor.
/// </summary>
public ExecutedVertexInstance Vertex { get; protected set; }
/// <summary>
/// Create an object that diagnoses the problems of one vertex.
/// The stack trace file name is initialized to "dryadLinqStackTrace.txt".
/// </summary>
/// <param name="job">Job containing the vertex.</param>
/// <param name="plan">Plan of the executed job.</param>
/// <param name="vertex">Vertex to diagnose.</param>
/// <param name="manager">Communication manager.</param>
protected VertexFailureDiagnosis(DryadLinqJobInfo job, DryadJobStaticPlan plan, ExecutedVertexInstance vertex, CommManager manager)
    : base(job, plan, manager)
{
    // ReSharper disable once DoNotCallOverridableMethodsInConstructor
    this.stackTraceFile = "dryadLinqStackTrace.txt";
    this.Vertex = vertex;
    // The base constructor has already stored the job; it is stored again here as in the original code.
    this.Job = job;
}
/// <summary>
/// Create a VertexFailureDiagnosis of the type appropriate for the job's cluster.
/// In this version no cluster type is handled, so the method always throws.
/// </summary>
/// <param name="job">Job containing the vertex.</param>
/// <param name="plan">Plan of the executed job.</param>
/// <param name="vertex">Vertex to diagnose.</param>
/// <param name="manager">Communication manager.</param>
/// <returns>A subclass of VertexFailureDiagnosis.</returns>
/// <exception cref="InvalidOperationException">The cluster type is not handled.</exception>
public static VertexFailureDiagnosis CreateVertexFailureDiagnosis(DryadLinqJobInfo job,
    DryadJobStaticPlan plan,
    ExecutedVertexInstance vertex,
    CommManager manager)
{
    ClusterConfiguration effectiveConfig = job.ClusterConfiguration;
    // A cache configuration only wraps the real one; diagnose against the real cluster.
    CacheClusterConfiguration cached = effectiveConfig as CacheClusterConfiguration;
    if (cached != null)
    {
        effectiveConfig = cached.ActualConfig(job.Summary);
    }
    string reason = "Config of type " + effectiveConfig.TypeOfCluster + " not handled";
    throw new InvalidOperationException(reason);
}
/// <summary>
/// The main function of the diagnosis.
/// This base implementation always throws; derived classes must override it.
/// </summary>
/// <param name="log">Log where explanation is written.</param>
/// <returns>The decision of the diagnosis.</returns>
/// <exception cref="InvalidOperationException">Always thrown by this base implementation.</exception>
public virtual Decision Diagnose(DiagnosisLog log)
{
throw new InvalidOperationException("Must override this function");
}
/// <summary>
/// Diagnose the vertex into a fresh log.
/// The log starts with a header naming the vertex and its state; the decision returned by
/// the log-taking overload is discarded, and a completion status is reported to the manager.
/// </summary>
/// <returns>The log of the diagnostic.</returns>
public DiagnosisLog Diagnose()
{
    DiagnosisLog result = new DiagnosisLog(this.Job, this.Summary);
    string title = "Diagnostic for " + this.VertexName;
    string state = "Vertex state is " + this.Vertex.State;
    DiagnosisMessage header = new DiagnosisMessage(DiagnosisMessage.Importance.Final, title, state);
    result.AddMessage(header);
    this.Diagnose(result);
    this.Manager.Status("Vertex diagnosis complete", StatusKind.OK);
    return result;
}
/// <summary>
/// Enumerate the log files of the vertex.
/// Yields nothing when the log directory reports an exception, and skips entries that report an exception.
/// </summary>
/// <param name="errorLogs">If true return only the error logs.</param>
/// <returns>An iterator over all log files.</returns>
public virtual IEnumerable<IClusterResidentObject> Logs(bool errorLogs)
{
    IClusterResidentObject directory = this.Job.ClusterConfiguration.ProcessLogDirectory(this.Vertex.ProcessIdentifier, this.Vertex.VertexIsCompleted, this.Vertex.Machine, this.Job.Summary);
    string filePattern = this.Job.ClusterConfiguration.VertexLogFilesPattern(errorLogs, this.Job.Summary);
    if (directory.Exception == null)
    {
        foreach (IClusterResidentObject candidate in directory.GetFilesAndFolders(filePattern))
        {
            if (candidate.Exception != null)
                continue;
            yield return candidate;
        }
    }
}
/// <summary>
/// Detect whether the vertex had problems reading a particular channel.
/// The decision is based on the first line of the vertex stack trace, which must
/// contain a "failed to read from input channel at port N" exception message.
/// </summary>
/// <param name="manager">Communication manager.</param>
/// <returns>The channel that cannot be read, or null if that's not the problem.</returns>
public virtual ChannelEndpointDescription ChannelReadFailure(CommManager manager)
{
    // Only the first line is needed, so do not read the whole stack trace file.
    string firstLine = this.StackTrace().FirstOrDefault();
    if (firstLine == null)
        return null;
    Regex errorMsg = new Regex(@"(.*)Exception: (.*)ailed to read from input channel at port (\d+)");
    Match m = errorMsg.Match(firstLine);
    if (!m.Success)
        return null;
    int channelNo;
    // \d+ can still describe a number that does not fit in an int.
    if (!int.TryParse(m.Groups[3].Value, out channelNo))
        return null;
    this.Vertex.DiscoverChannels(true, false, true, manager);
    var channels = this.Vertex.InputChannels;
    if (channels == null)
        return null;
    // NOTE(review): assumes channel numbers are 0-based (valid values are 0 .. Count-1) - confirm.
    // The previous test (Count < channelNo) let channelNo == Count through to the indexer below.
    if (channels.Count <= channelNo)
    {
        this.Log(DiagnosisMessage.Importance.Error, "Could not discover channel " + channelNo, this.VertexName);
        return null;
    }
    return channels[channelNo];
}
/// <summary>
/// Detect whether vertex terminates with a stack overflow, by scanning the
/// vertex standard output for the text "StackOverflowException".
/// </summary>
/// <returns>Yes if such a line is found (the line is logged); Dontknow otherwise.</returns>
protected virtual Decision StackOverflow()
{
    IClusterResidentObject stdout = this.Job.ClusterConfiguration.ProcessStdoutFile(this.Vertex.ProcessIdentifier, this.Vertex.VertexIsCompleted, this.Vertex.Machine, this.Job.Summary);
    if (stdout.Exception != null)
        return Decision.Dontknow;
    ISharedStreamReader sr = stdout.GetStream();
    try
    {
        // The stream itself may have failed to open even if the file object looked fine.
        if (sr.Exception != null)
            return Decision.Dontknow;
        while (!sr.EndOfStream)
        {
            string line = sr.ReadLine();
            if (line == null)
                break;
            if (line.Contains("StackOverflowException"))
            {
                // NOTE(review): the message says "stderr" although the file read is the stdout file - confirm which is intended.
                this.Log(DiagnosisMessage.Importance.Final, "Error found in vertex stderr:", line);
                return Decision.Yes;
            }
        }
        return Decision.Dontknow;
    }
    finally
    {
        // Always release the reader, including when reading or logging throws.
        sr.Close();
    }
}
/// <summary>
/// Try to diagnose whether there's a CLR mismatch error, by looking for
/// error code 0x80131700 in the first 10 lines of the vertex standard output.
/// </summary>
/// <returns>True if this is the problem (the matching line is logged).</returns>
protected bool CLRStartupProblems()
{
    // only look for the error in this many leading lines
    const int linesToInspect = 10;
    IClusterResidentObject stdout = this.Job.ClusterConfiguration.ProcessStdoutFile(this.Vertex.ProcessIdentifier, this.Vertex.VertexIsCompleted, this.Vertex.Machine, this.Job.Summary);
    if (stdout.Exception != null)
        return false;
    ISharedStreamReader sr = stdout.GetStream();
    try
    {
        // The stream itself may have failed to open even if the file object looked fine.
        if (sr.Exception != null)
            return false;
        for (int i = 0; i < linesToInspect && !sr.EndOfStream; i++)
        {
            string line = sr.ReadLine();
            if (line == null)
                break;
            if (line.Contains("Error code 2148734720 (0x80131700)"))
            {
                this.Log(DiagnosisMessage.Importance.Final, "Error found in vertex stdout:", line);
                return true;
            }
        }
        return false;
    }
    finally
    {
        // Always release the reader, including when reading or logging throws.
        sr.Close();
    }
}
/// <summary>
/// Name of the vertex that is being diagnosed (read from <see cref="Vertex"/>).
/// </summary>
public string VertexName { get { return this.Vertex.Name; } }
/// <summary>
/// Name of the file containing the stack trace; it is looked up in the
/// vertex process work directory when the stack trace is read.
/// </summary>
public string stackTraceFile { get; protected set; }
/// <summary>
/// The stack trace of the vertex at the time of the crash, read line by line
/// from the stack trace file in the vertex process work directory.
/// </summary>
/// <returns>The stack trace or an empty collection if the file cannot be opened.</returns>
public virtual IEnumerable<string> StackTrace()
{
    IClusterResidentObject logdir = this.Job.ClusterConfiguration.ProcessWorkDirectory(this.Vertex.ProcessIdentifier, this.Vertex.VertexIsCompleted, this.Vertex.Machine, this.Job.Summary);
    IClusterResidentObject stackTrace = logdir.GetFile(this.stackTraceFile);
    ISharedStreamReader sr = stackTrace.GetStream();
    if (sr.Exception != null)
        yield break;
    try
    {
        while (!sr.EndOfStream)
            yield return sr.ReadLine();
    }
    finally
    {
        // Runs when the enumeration completes or when the caller disposes the enumerator
        // early (e.g. foreach with an early return), so the reader is no longer leaked.
        sr.Close();
    }
}
/// <summary>
/// Find the job inputs this vertex reads from, using the static plan.
/// Yields nothing when there is no static plan or the vertex stage is not found in it.
/// </summary>
/// <returns>The list of input stages this vertex is reading from.</returns>
protected IEnumerable<DryadJobStaticPlan.Stage> VertexIsReadingFromJobInput()
{
    DryadJobStaticPlan plan = this.StaticPlan;
    if (plan != null)
    {
        DryadJobStaticPlan.Stage vertexStage = plan.GetStageByName(this.Vertex.StageName);
        if (vertexStage != null)
        {
            foreach (DryadJobStaticPlan.Connection link in plan.GetStageConnections(vertexStage, true))
            {
                var source = link.From;
                if (!source.IsInput)
                    continue;
                yield return source;
            }
        }
    }
}
/// <summary>
/// Detect whether incorrect data serialization may be the reason for the failure.
/// This requires that the vertex reads from a job input and that it failed while reading.
/// </summary>
/// <returns>Yes if serialization is the likely issue (two explanatory messages are logged);
/// Dontknow when the vertex reads no job input; otherwise the read-failure decision.</returns>
protected virtual Decision SerializationError()
{
    var inputUris = this.VertexIsReadingFromJobInput().Select(s => s.Uri).ToArray();
    if (inputUris.Length == 0)
    {
        // the vertex is not reading from an input file
        return Decision.Dontknow;
    }
    Decision readFailure = this.VertexFailedWhenReading();
    if (readFailure == Decision.Yes)
    {
        this.Log(DiagnosisMessage.Importance.Final,
            "A vertex failed with an error that may be indicative of incorrect data serialization.",
            "Make sure that the program uses the correct data type for the used input files (and also it is using the proper combination of serialization attributes)."
            );
        string inputList = string.Join(",", inputUris);
        this.Log(DiagnosisMessage.Importance.Final,
            "The failed vertex is reading from the following input(s): ",
            inputList);
    }
    return readFailure;
}
/// <summary>
/// Decide whether the vertex had a read error, by looking for a
/// "DryadRecordReader" frame anywhere in the stack trace.
/// </summary>
/// <returns>Yes if such a frame is found; Dontknow otherwise.</returns>
protected virtual Decision VertexFailedWhenReading()
{
    bool readerOnStack = this.StackTrace().Any(frame => frame.Contains("DryadRecordReader"));
    return readerOnStack ? Decision.Yes : Decision.Dontknow;
}
/// <summary>
/// If true the vertex died with an assertion failure.
/// NOTE(review): no visible code in this class assigns it - presumably set by derived classes; confirm.
/// </summary>
public bool DiedWithAssertion { get; protected set; }
}
/// <summary>
/// Diagnose failures in a job.
/// </summary>
public abstract class JobFailureDiagnosis : FailureDiagnosis
{
/// <summary>
/// Job manager vertex, taken from the job's ManagerVertex by the constructors.
/// May be null when the job information could not be found.
/// </summary>
protected readonly ExecutedVertexInstance jobManager;
/// <summary>
/// Create an object that diagnoses the problems of a job whose information is already available.
/// A fresh diagnosis log is created for the job.
/// </summary>
/// <param name="job">Job to diagnose.</param>
/// <param name="plan">Plan of the diagnosed job.</param>
/// <param name="manager">Communication manager.</param>
protected JobFailureDiagnosis(DryadLinqJobInfo job, DryadJobStaticPlan plan, CommManager manager)
    : base(job, plan, manager)
{
    DryadLinqJobSummary jobSummary = job.Summary;
    this.diagnosisLog = new DiagnosisLog(job, jobSummary);
    this.jobManager = this.Job.ManagerVertex;
}
/// <summary>
/// Create an object that diagnoses the problems of a job known only by its summary.
/// The base constructor tries to locate the job; if that fails the job manager vertex stays unset.
/// </summary>
/// <param name="config">Cluster where job resides.</param>
/// <param name="summary">Job summary.</param>
/// <param name="manager">Communication manager.</param>
protected JobFailureDiagnosis(ClusterConfiguration config, DryadLinqJobSummary summary, CommManager manager)
    : base(config, summary, manager)
{
    DryadLinqJobInfo located = this.Job;
    this.diagnosisLog = new DiagnosisLog(located, summary);
    if (located == null)
        return;
    this.jobManager = located.ManagerVertex;
}
/// <summary>
/// Decide whether the job has finished executing, based on the status in the job summary.
/// </summary>
/// <returns>Yes if the status denotes a finished job, No otherwise.</returns>
protected Decision IsJobFinished()
{
    if (ClusterJobInformation.JobIsFinished(this.Summary.Status))
        return Decision.Yes;
    return Decision.No;
}
/// <summary>
/// Decide whether the job has failed, based on the status in the job summary.
/// </summary>
/// <returns>Yes for a failed job; No for a cancelled or succeeded job;
/// Dontknow for a running job (it may still fail) or an unknown status.</returns>
/// <exception cref="InvalidDataException">The status is none of the known values.</exception>
protected Decision IsJobFailed()
{
    var status = this.Summary.Status;
    if (status == ClusterJobInformation.ClusterJobStatus.Failed)
        return Decision.Yes;
    if (status == ClusterJobInformation.ClusterJobStatus.Cancelled ||
        status == ClusterJobInformation.ClusterJobStatus.Succeeded)
        return Decision.No;
    // A running job may still fail.
    if (status == ClusterJobInformation.ClusterJobStatus.Running ||
        status == ClusterJobInformation.ClusterJobStatus.Unknown)
        return Decision.Dontknow;
    throw new InvalidDataException("Invalid job status " + status);
}
/// <summary>
/// Main entry point: diagnose the failures of a job.
/// This base implementation always throws; derived classes must override it.
/// </summary>
/// <returns>The log containing the diagnosis result.</returns>
/// <exception cref="InvalidOperationException">Always thrown by this base implementation.</exception>
public virtual DiagnosisLog Diagnose()
{
throw new InvalidOperationException("Must override this function");
}
/// <summary>
/// Discover whether the failure is caused by the inability to parse the XML plan,
/// by inspecting the first line of the job manager standard output.
/// </summary>
/// <returns>Yes if the first line reports an XML parsing error; No if it does not (or the output is empty);
/// Dontknow if the job manager or its standard output is unavailable.</returns>
public Decision XmlPlanParseError()
{
    if (this.jobManager == null)
    {
        this.Log(DiagnosisMessage.Importance.Tracing, "Could not find job manager vertex information", "");
        return Decision.Dontknow;
    }
    IClusterResidentObject jmstdout = this.jobManager.StdoutFile;
    if (jmstdout.Exception != null)
    {
        this.Log(DiagnosisMessage.Importance.Tracing, "Could not find job manager standard output", "");
        return Decision.Dontknow;
    }
    ISharedStreamReader sr = jmstdout.GetStream();
    if (sr.Exception != null)
    {
        this.Log(DiagnosisMessage.Importance.Tracing, "Could not read job manager standard output", sr.Exception.Message);
        return Decision.Dontknow;
    }
    string firstline;
    try
    {
        firstline = sr.ReadLine();
    }
    finally
    {
        // Only the first line is needed; release the reader even if reading throws.
        sr.Close();
    }
    // Do not test EndOfStream here: an output consisting of just the error line
    // is at end-of-stream after the read, and that line must still be inspected.
    if (firstline == null)
        return Decision.No;
    if (firstline.Contains("Error parsing input XML file"))
    {
        this.Log(DiagnosisMessage.Importance.Final, "The job manager cannot parse the XML plan file.\n",
            "This means probably that the version of LinqToDryad.dll that you are using does not match the XmlExecHost.exe file from your drop.");
        return Decision.Yes;
    }
    return Decision.No;
}
/// <summary>
/// Find a vertex (by name) with more than 3 failed, non-manager instances.
/// </summary>
/// <returns>The first failed instance of the most frequently failed vertex,
/// or null if no vertex failed more than 3 times.</returns>
protected ExecutedVertexInstance LookForRepeatedVertexFailures()
{
    var failedVertices = this.Job.Vertices
        .Where(v => v.State == ExecutedVertexInstance.VertexState.Failed && !v.IsManager)
        .ToList();
    if (failedVertices.Count == 0)
        return null;
    // Stable descending sort: among equally frequent names the earliest group wins.
    var worst = failedVertices
        .GroupBy(v => v.Name)
        .OrderByDescending(g => g.Count())
        .First();
    return worst.Count() > 3 ? worst.First() : null;
}
/// <summary>
/// Find whether the job had many (at least 5) failed, non-manager vertex instances overall.
/// </summary>
/// <returns>The first failed instance of the most frequently failed vertex,
/// or null if there are fewer than 5 failures in total.</returns>
protected ExecutedVertexInstance LookForManyVertexFailures()
{
    var failedVertices = this.Job.Vertices
        .Where(v => v.State == ExecutedVertexInstance.VertexState.Failed && !v.IsManager)
        .ToList();
    if (failedVertices.Count < 5)
        return null;
    // Stable descending sort: among equally frequent names the earliest group wins.
    var worst = failedVertices
        .GroupBy(v => v.Name)
        .OrderByDescending(g => g.Count())
        .First();
    return worst.First();
}
/// <summary>
/// Find multiple failures on the same machine.
/// </summary>
/// <returns>No if there are fewer than 5 failures in total; Yes (with a logged message) if the
/// machine with most failures has more than a third of all failures or more than 4 of them;
/// Dontknow otherwise.</returns>
protected Decision LookForCorrelatedMachineFailures()
{
    // if we have more than this many failures we start to worry
    const int maxFailures = 5;
    var failedVertices = this.Job.Vertices
        .Where(v => v.State == ExecutedVertexInstance.VertexState.Failed && !v.IsManager)
        .ToList();
    int totalFailures = failedVertices.Count;
    if (totalFailures < maxFailures)
        return Decision.No;
    var worstMachine = failedVertices
        .GroupBy(v => v.Machine)
        .OrderByDescending(g => g.Count())
        .First();
    int failuresOnWorst = worstMachine.Count();
    string failMachine = worstMachine.Key;
    bool concentrated = failuresOnWorst > totalFailures / 3 || failuresOnWorst > 4;
    if (!concentrated)
        return Decision.Dontknow;
    this.Log(DiagnosisMessage.Importance.Final,
        "There are " + failuresOnWorst + " failures on machine " + failMachine,
        "Total number of failures is " + totalFailures);
    return Decision.Yes;
}
/// <summary>
/// Check whether the job was aborted because one vertex failed too many times,
/// by parsing the job's abort message.
/// </summary>
/// <returns>A failed instance of the vertex named in the abort message, or null if the job has
/// no such abort message or no failed instance with that name is known.</returns>
protected virtual ExecutedVertexInstance DeterministicVertexFailure()
{
    string abortMessage = this.Job.AbortingMsg;
    if (abortMessage == null)
        return null;
    // Example message:
    // ABORTING: Vertex failed too many times. Vertex 2 (OrderBy__0) number of failed executions 6
    Regex tooManyFailures = new Regex(@"Vertex failed too many times. Vertex (\d+) \((.*)\) number of failed executions (\d+)");
    Match parsed = tooManyFailures.Match(abortMessage);
    if (!parsed.Success)
        return null;
    string vertexName = parsed.Groups[2].Value;
    string failureCount = parsed.Groups[3].Value;
    this.Log(DiagnosisMessage.Importance.Final, string.Format("Job was aborted because vertex {0} failed {1} times", vertexName, failureCount), "");
    var failedInstances = this.Job.Vertices
        .Where(vi => vi.Name == vertexName && vi.State == ExecutedVertexInstance.VertexState.Failed)
        .ToList();
    if (failedInstances.Count > 0)
        return failedInstances[0];
    this.Log(DiagnosisMessage.Importance.Error, "Cannot find information about failed vertex", vertexName);
    return null;
}
/// <summary>
/// Decide whether the job died because some input data could not be read,
/// by looking for "read failures" in the job's abort message.
/// </summary>
/// <returns>No if there is no abort message; Yes (with a logged message) if the abort message
/// mentions read failures; Dontknow otherwise.</returns>
protected virtual Decision DeterministicInputFailure()
{
    string abortMessage = this.Job.AbortingMsg;
    if (string.IsNullOrEmpty(abortMessage))
        return Decision.No;
    if (!abortMessage.Contains("read failures"))
        return Decision.Dontknow;
    this.Log(DiagnosisMessage.Importance.Final, "Job cannot read some input data", abortMessage);
    return Decision.Yes;
}
/// <summary>
/// Create a suitable Job Failure diagnosis object for the job being analyzed.
/// In this version no cluster type is supported, so the method always throws.
/// </summary>
/// <param name="job">Job to diagnose.</param>
/// <param name="plan">Plan of the job being diagnosed.</param>
/// <param name="manager">Communication manager.</param>
/// <returns>A subclass of JobFailureDiagnosis with the type appropriate for the job.</returns>
/// <exception cref="InvalidOperationException">The cluster type is not supported.</exception>
public static JobFailureDiagnosis CreateJobFailureDiagnosis(DryadLinqJobInfo job, DryadJobStaticPlan plan, CommManager manager)
{
    ClusterConfiguration effectiveConfig = job.ClusterConfiguration;
    // A cache configuration only wraps the real one; diagnose against the real cluster.
    CacheClusterConfiguration cached = effectiveConfig as CacheClusterConfiguration;
    if (cached != null)
    {
        effectiveConfig = cached.ActualConfig(job.Summary);
    }
    string reason = "Configuration of type " + effectiveConfig.TypeOfCluster + " not supported for diagnosis";
    throw new InvalidOperationException(reason);
}
/// <summary>
/// Create a suitable Job Failure diagnosis object for a job known only by its summary.
/// In this version no cluster type is supported, so the method always throws.
/// </summary>
/// <param name="config">Cluster where job resides.</param>
/// <param name="summary">Summary of the job to diagnose.</param>
/// <param name="manager">Communication manager.</param>
/// <returns>A subclass of JobFailureDiagnosis with the type appropriate for the job.</returns>
/// <exception cref="InvalidOperationException">The cluster type is not supported.</exception>
public static JobFailureDiagnosis CreateJobFailureDiagnosis(ClusterConfiguration config, DryadLinqJobSummary summary, CommManager manager)
{
    // A cache configuration only wraps the real one; diagnose against the real cluster.
    CacheClusterConfiguration cached = config as CacheClusterConfiguration;
    if (cached != null)
    {
        config = cached.ActualConfig(summary);
    }
    string reason = "Configuration of type " + config.TypeOfCluster + " not supported for diagnosis";
    throw new InvalidOperationException(reason);
}
/// <summary>
/// Look to see whether the vertices failed reading from some common set of machines.
/// This is incomplete: e.g., it does not work for tidyfs streams.
/// </summary>
/// <param name="manager">Communication manager; also receives progress updates.</param>
/// <returns>No if there are fewer than 5 failed vertices or fewer than 5 channel read failures;
/// Yes if more than 3 failed reads target files on the same machine; Dontknow otherwise.</returns>
protected Decision LookForCorrelatedReadFailures(CommManager manager)
{
    // if we have more than this many failures we start to worry
    const int maxFailures = 5;
    var failedVertices = this.Job.Vertices
        .Where(v => v.State == ExecutedVertexInstance.VertexState.Failed && !v.IsManager)
        .ToList();
    int totalFailures = failedVertices.Count;
    if (totalFailures < maxFailures)
        return Decision.No;

    // Ask each failed vertex which input channel (if any) it could not read.
    List<ChannelEndpointDescription> unreadable = new List<ChannelEndpointDescription>();
    for (int i = 0; i < totalFailures; i++)
    {
        ExecutedVertexInstance failedVertex = failedVertices[i];
        VertexFailureDiagnosis vertexDiagnosis = VertexFailureDiagnosis.CreateVertexFailureDiagnosis(this.Job, this.StaticPlan, failedVertex, manager);
        ChannelEndpointDescription channel = vertexDiagnosis.ChannelReadFailure(manager);
        if (channel != null)
            unreadable.Add(channel);
        manager.Progress((i + 1) * 100 / totalFailures);
    }
    if (unreadable.Count < maxFailures)
        return Decision.No;
    this.Log(DiagnosisMessage.Importance.Final, "There are " + unreadable.Count + " read failures in the job", "");

    var fileChannels = unreadable.Where(ced => ced.UriType == "file").ToList();
    if (fileChannels.Count == 0)
    {
        this.Log(DiagnosisMessage.Importance.Final, "All channels with failures are distributed files", "No further information is available");
        return Decision.Dontknow;
    }

    // Count the failed reads per machine hosting the file.
    Decision verdict = Decision.Dontknow;
    var perMachine = fileChannels
        .Select(f => new UNCPathname(f.LocalPath).Machine)
        .GroupBy(machine => machine)
        .ToList();
    foreach (var machineGroup in perMachine)
    {
        int failuresOnMachine = machineGroup.Count();
        if (failuresOnMachine <= 3)
            continue;
        this.Log(DiagnosisMessage.Importance.Final, "There are " + failuresOnMachine + " read failures reading from machine", machineGroup.Key);
        verdict = Decision.Yes;
    }
    return verdict;
}
}
#endregion
#region COSMOS_DIAGNOSIS
#region JOB_STILL_RUNNING
#endregion
#endregion
#region SCOPE_DIAGNOSIS
#region JOB_STILL_RUNNING
#endregion
#endregion
#region HPC_DIAGNOSIS
#region JOB_STILL_RUNNING
#endregion
#endregion
#region L2H_JOB_DIAGNOSIS
#region JOB_STILL_RUNNING
#endregion
#region HPC_ERRORS
#endregion
#endregion
}