Dryad/JobBrowser/JOM/jobinfo.cs

3499 lines
148 KiB
C#

/*
Copyright (c) Microsoft Corporation
All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
compliance with the License. You may obtain a copy of the License
at http://www.apache.org/licenses/LICENSE-2.0
THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER
EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED WARRANTIES OR CONDITIONS OF
TITLE, FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABLITY OR NON-INFRINGEMENT.
See the Apache Version 2.0 License for specific language governing permissions and
limitations under the License.
*/
#undef USE_LINQ_TO_DRYAD
#undef USE_HPC
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml.Linq;
using Microsoft.Research.Calypso.Tools;
using System.Diagnostics;
namespace Microsoft.Research.Calypso.JobObjectModel
{
/// <summary>
/// Exception throw by Calypso when it cannot understand the structure of a Dryad/DryadLINQ job.
/// </summary>
public class CalypsoDryadException : Exception
{
/// <summary>
/// Create a new CalypsoDryadException.
/// </summary>
/// <param name="message">Message conveyed by the exception.</param>
public CalypsoDryadException(string message) : base(message) { }
}
/// <summary>
/// Classes providing a parsing from string routine.
/// </summary>
public interface IParse
{
/// <summary>
/// Parse one line.
/// </summary>
/// <param name="line">Line to parse.</param>
void Parse(string line);
}
/// <summary>
/// One stage (a set of vertices) in a DryadLINQ job.
/// </summary>
public class DryadLinqJobStage
{
/// <summary>
/// Number of vertices defined in the static plan; 0 if unknown.
/// This field must be set explicitly, it is not computed by the constructor, since the information is not available in the set of vertices.
/// </summary>
public int StaticVertexCount { get; set; }
/// <summary>
/// Stage name.
/// </summary>
public string Name { get; protected set; }
/// <summary>
/// List of vertices in the stage.
/// </summary>
readonly IEnumerable<ExecutedVertexInstance> vertices;
/// <summary>
/// Number of executed vertices; does not include abandoned vertices.
/// </summary>
public int TotalInitiatedVertices { get; protected set; }
/// <summary>
/// Created but not yet started.
/// </summary>
public int CreatedVertices { get; protected set; }
/// <summary>
/// Vertices that have started (may still be running).
/// </summary>
public int StartedVertices { get; protected set; }
/// <summary>
/// Vertices that have completed and then have been cancelled.
/// </summary>
public int InvalidatedVertices { get; protected set; }
/// <summary>
/// Number of failed vertices.
/// </summary>
public int FailedVertices { get; protected set; }
/// <summary>
/// Vertices that have completed successfully.
/// </summary>
public int SuccessfulVertices { get; protected set; }
/// <summary>
/// Number of cancelled vertices.
/// </summary>
public int CancelledVertices { get; protected set; }
/// <summary>
/// Number of vertices abandoned before running.
/// </summary>
public int AbandonedVertices { get; protected set; }
/// <summary>
/// Number of vertices cancelled by the remote scheduler.
/// </summary>
public int RevokedVertices { get; protected set; }
/// <summary>
/// How long has this stage been running?
/// </summary>
public TimeSpan RunningTime { get { return this.EndTime - this.StartTime; } }
/// <summary>
/// Time when first vertex in stage started.
/// </summary>
public DateTime StartTime { get; protected set; }
/// <summary>
/// Time when last vertex in stage finished.
/// </summary>
public DateTime EndTime { get; protected set; }
/// <summary>
/// Amount of data read (-1 if unknown).
/// </summary>
public long DataRead { get; protected set; }
/// <summary>
/// Amount of data written (-1 if unknown).
/// </summary>
public long DataWritten { get; protected set; }
/// <summary>
/// Information about the vertices executed in this stage.
/// </summary>
public IEnumerable<ExecutedVertexInstance> Vertices { get { return this.vertices; } }
/// <summary>
/// Create a DryadLinqJobStage from a set of vertices.
/// </summary>
/// <param name="stagename">Name of stage.</param>
/// <param name="vertices">Set of vertices contained in the stage.</param>
public DryadLinqJobStage(string stagename, List<ExecutedVertexInstance> vertices)
{
this.StaticVertexCount = 0; // not yet known
this.DataRead = vertices.Select(v => v.DataRead).Sum();
this.DataWritten = vertices.Select(v => v.DataWritten).Sum();
if (this.DataRead < 0)
this.DataRead = -1;
if (this.DataWritten < 0)
this.DataWritten = -1;
this.vertices = vertices;
this.Name = stagename;
this.AbandonedVertices =
this.FailedVertices =
this.CancelledVertices =
this.StartedVertices =
this.SuccessfulVertices =
this.CreatedVertices =
this.InvalidatedVertices =
this.RevokedVertices =
this.TotalInitiatedVertices = 0;
foreach (var vertex in this.vertices)
{
this.TotalInitiatedVertices++;
switch (vertex.State)
{
case ExecutedVertexInstance.VertexState.Revoked:
this.RevokedVertices++;
break;
case ExecutedVertexInstance.VertexState.Abandoned:
this.AbandonedVertices++;
break;
case ExecutedVertexInstance.VertexState.Invalidated:
this.InvalidatedVertices++;
break;
case ExecutedVertexInstance.VertexState.Cancelled:
this.CancelledVertices++;
break;
case ExecutedVertexInstance.VertexState.Failed:
this.FailedVertices++;
break;
case ExecutedVertexInstance.VertexState.Created:
this.CreatedVertices++;
break;
case ExecutedVertexInstance.VertexState.Started:
this.StartedVertices++;
break;
case ExecutedVertexInstance.VertexState.Successful:
this.SuccessfulVertices++;
break;
default:
throw new CalypsoDryadException("Unexpected vertex state " + vertex.State);
}
}
this.TotalInitiatedVertices -= this.AbandonedVertices;
if (this.TotalInitiatedVertices == 0)
return;
List<ExecutedVertexInstance> runVertices = this.vertices.Where(v => v.Start != DateTime.MinValue).ToList();
if (runVertices.Count > 0) this.StartTime = runVertices.Select(v => v.Start).Min();
this.EndTime = this.vertices.Select(v => v.End).Max();
}
}
/// <summary>
/// This class contains all the information about a Dryad job.
/// </summary>
public class DryadLinqJobInfo
{
/// <summary>
/// A summary of the job.
/// </summary>
DryadLinqJobSummary jobSummary;
/// <summary>
/// The start time of the job manager.
/// </summary>
public DateTime StartJMTime {
get
{
return this.ManagerVertex.Start;
}
}
/// <summary>
/// When was the job state last updated?
/// </summary>
public DateTime LastUpdatetime { get; private set; }
/// <summary>
/// The end time of the job.
/// </summary>
public DateTime EndTime {
get
{
return this.ManagerVertex.End;
}
}
/// <summary>
/// How long has the job been running?
/// </summary>
public TimeSpan RunningTime {
get
{
return this.ManagerVertex.RunningTime;
}
}
/// <summary>
/// All the vertices for the job.
/// </summary>
JobVertices jobVertices;
/// <summary>
/// When parsing stdout save here last vertex with failure, to attach additional
/// error messages to it.
/// </summary>
ExecutedVertexInstance lastFailedVertex;
/// <summary>
/// The path to the stdout of the job manager
/// </summary>
IClusterResidentObject stdoutpath;
/// <summary>
/// The name of the Job
/// </summary>
public string JobName { get { return this.Summary.Name; } }
/// <summary>
/// Error code of the dryadlinq job.
/// </summary>
public string ErrorCode { get; set; }
/// <summary>
/// The job manager vertex for this job.
/// </summary>
public ExecutedVertexInstance ManagerVertex { get; set; }
/// <summary>
/// Is the standard output complete or truncated?
/// </summary>
public bool ManagerStdoutIncomplete { get; protected set; }
/// <summary>
/// True if the information to create this jobinfo is no longer available.
/// </summary>
public bool JobInfoCannotBeCollected { get; protected set; }
/// <summary>
/// Number of stages that have started execution.
/// </summary>
public int ExecutedStageCount { get { return this.jobVertices.ExecutedStageCount; } }
/// <summary>
/// Total data read by job.
/// </summary>
public long TotalDataRead { get; protected set; }
/// <summary>
/// Data read intra-pod.
/// </summary>
public long IntraPodDataRead { get; protected set; }
/// <summary>
/// Data read cross pod.
/// </summary>
public long CrossPodDataRead { get; protected set; }
/// <summary>
/// Data read from the same machine.
/// </summary>
public long LocalReadData { get; protected set; }
/// <summary>
/// Approximate timing information, used for vertices which have not terminated yet.
/// </summary>
private DateTime lastTimestampSeen;
/// <summary>
/// The vertices in this job
/// </summary>
public IEnumerable<ExecutedVertexInstance> Vertices
{
get { return this.jobVertices.AllVertices(); }
}
/// <summary>
/// The summary of this job
/// </summary>
public DryadLinqJobSummary Summary
{
get { return this.jobSummary; }
}
/// <summary>
/// Message returned by job manager when job aborts.
/// </summary>
public string AbortingMsg { get; private set; }
/// <summary>
/// The cluster where the job information resides.
/// </summary>
public ClusterConfiguration ClusterConfiguration { get; protected set; }
/// <summary>
/// Original cluster configuration (the config can be just "cache").
/// </summary>
// ReSharper disable once UnusedAutoPropertyAccessor.Local
private ClusterConfiguration OriginalClusterConfiguration { get; set; }
/// <summary>
/// Regular expression for parsing a stdout line with vertex statistics.
/// </summary>
private readonly static Regex vertexStartRegex, vertexCreatedRegex, processCreatingRegex, timingInfoRegex, terminationRegex, verticesCreatedRegex, ioRegex,
terminatedRegex, vertexAbandonedRegex, failedRegex, cancelRegex, datareadRegex, inputFailureRegex, setToFailedlRegex, revokedRegex;
/// <summary>
/// Useful CPU time.
/// </summary>
public TimeSpan UsefulCPUTime { get; protected set; }
/// <summary>
/// Time spent in failed vertices.
/// </summary>
public TimeSpan WastedCPUTime { get; protected set; }
/// <summary>
/// Average degree of parallelism.
/// </summary>
public double AverageParallelism {
get
{
// ReSharper disable CompareOfFloatsByEqualityOperator
if (this.RunningTime.TotalSeconds != 0)
// ReSharper restore CompareOfFloatsByEqualityOperator
return this.UsefulCPUTime.TotalSeconds / this.RunningTime.TotalSeconds;
else
return 0;
}
}
/// <summary>
/// Number of executed vertices.
/// </summary>
public int ExecutedVertexCount { get { return this.jobVertices.Count; } }
/// <summary>
/// Compile a bunch of constant regular expressions.
/// </summary>
static DryadLinqJobInfo()
{
// optional guid regular expression
const string optGuidRegex = @"GUID \{?([-a-fA-F0-9]+)\}";
// Abandoning duplicate scheduling/execution of vertex 83.1 (InputTable__26[5])
vertexAbandonedRegex = new Regex(@"Abandoning duplicate \w+ of vertex (\d+)\.(\d+) \((.+)\)", RegexOptions.Compiled);
// Created process execution record for vertex 33 (Super__0[0]) v.0 GUID {B0FC788F-1FFC-4D74-AFC4-3EDFF03AF11A}
vertexCreatedRegex = new Regex(@"Created process execution record for vertex (\d+) \((.*)\) v.(\d+) " + optGuidRegex,
RegexOptions.Compiled);
// Creating process for vertex 2945 (Merge__17[440]) v.0 GUID {DDC9BB35-25D9-48A9-98C6-9EC7753FFB3B} machine sherwood-022
processCreatingRegex = new Regex(@"Creating process for vertex (\d+) \((.*)\) v.(\d+) " + optGuidRegex + @" machine (\w+)", RegexOptions.Compiled);
// Created process execution record for vertices 192 (Merge__41[0]) 223 (Union__45[0]) v.0 GUID {0297A91C-FFEA-42EA-94AF-CD0163A04D45}
verticesCreatedRegex = new Regex(@"Created process execution record for vertices (.*) v.(\d+) " + optGuidRegex,
RegexOptions.Compiled);
// Process started for vertex 5 (Super__0[1]) v.0 GUID {73EA55E0-0326-43C4-AD61-CB0B8CF8FE49} machine sherwood-025
// Process started for vertices 23 (Merge__29) 24 (Apply__33) v.0 GUID {E945DC5D-9AF6-4732-8770-2A6BF7FA3041} machine sherwood-237
vertexStartRegex = new Regex(@"Process started for vert(\w+) (.*) v\.(.*) " + optGuidRegex + @" machine (\S+)",
RegexOptions.Compiled);
// Timing Information 5 1 Super__0[1] 128654556602334453 0.0000 0.0000 0.0000 0.0000 0.2969
timingInfoRegex = new Regex(@"Timing Information (\d+) (\d+) (.+) (\d+) ([-.0-9]+) ([-.0-9]+) ([-.0-9]+) ([-.0-9]+) ([-.0-9]+)",
RegexOptions.Compiled);
// Vertex 5.0 (Super__0[1]) machine sherwood-025 guid {73EA55E0-0326-43C4-AD61-CB0B8CF8FE49} status Vertex Has Completed,
terminationRegex = new Regex(@"Vertex (\d+)\.(\d+) \((.+)\) machine (\S+) guid \{?([-a-fA-F0-9]+)\}? status (.*)",
RegexOptions.Compiled);
// Process was terminated Vertex 11.0 (Select__6[1]) GUID {C1E35A88-F5AD-4A26-BE5F-46B6D515623F} machine sherwood-118 status The operation succeeded
terminatedRegex = new Regex(@"Process was terminated Vertex (\d+)\.(\d+) \((.+)\) " + optGuidRegex + @" machine (\S+) status (.*)",
RegexOptions.Compiled);
// Process has failed Vertex 11.0 (Select__6[1]) GUID {C1E35A88-F5AD-4A26-BE5F-46B6D515623F} machine sherwood-118 Exitcode status The operation succeeded
failedRegex = new Regex(@"Process has failed Vertex (\d+)\.(\d+) \((.+)\) " + optGuidRegex + @" machine (\S+) Exitcode (.*)",
RegexOptions.Compiled);
// Canceling vertex 1461.0 (Merge__13[258]) due to dependent failure
cancelRegex = new Regex(@"Canceling vertex (\d+)\.(\d+) \((.+)\) due to (.*)", RegexOptions.Compiled);
// Setting vertex 1461.0 (Merge__13[258]) to failed
setToFailedlRegex = new Regex(@"Setting vertex (\d+)\.(\d+) \((.+)\) to failed(.*)", RegexOptions.Compiled);
// total=951722563162 local=37817665237 intrapod=189765117248 crosspod=724139780677
datareadRegex = new Regex(@"total=(\d+) local=(\d+) intrapod=(\d+) crosspod=(\d+)", RegexOptions.Compiled);
// Input vertex %u (%s) had %u read failure%s\n
inputFailureRegex = new Regex(@"Input vertex (\d+) \(.*\) had (\d+) read failure", RegexOptions.Compiled);
// Io information 23 1 Super__4[5] read 7106 wrote 933
ioRegex = new Regex(@"Io information (\d+) (\d+) (.+) read (\d+) wrote (\d+)");
// Cancellations by Quincy
revokedRegex = new Regex(@"Process was revoked by remote scheduler Old " + optGuidRegex + @" New " + optGuidRegex);
}
/// <summary>
/// Create information about a job run on the cluster.
/// </summary>
/// <param name="cf">Cluster configuration.</param>
/// <param name="summary">Summary description of the job.</param>
/// <returns>The Dryad job description, or null.</returns>
/// <param name="reporter">Delegate used to report errors.</param>
/// <param name="fill">If true, fill all the information, otherwise the user will have to call FillInformation on the result later.</param>
/// <param name="updateProgress">Delegate used to report progress.</param>
public static DryadLinqJobInfo CreateDryadLinqJobInfo(ClusterConfiguration cf, DryadLinqJobSummary summary, bool fill, StatusReporter reporter, Action<int> updateProgress)
{
try
{
DryadLinqJobInfo job = new DryadLinqJobInfo(cf, summary);
if (fill)
job.CollectEssentialInformation(reporter, updateProgress);
return job;
}
catch (Exception e)
{
Trace.TraceInformation(e.ToString());
reporter("Could not collect job information for " + summary.Name + ": " + e.Message, StatusKind.Error);
return null;
}
}
/// <summary>
/// Read the information about a job which ran the JM on the cluster
/// </summary>
/// <param name="cf">Configuration of the cluster.</param>
/// <param name="summary">Summary of the job.</param>
protected DryadLinqJobInfo(ClusterConfiguration cf, DryadLinqJobSummary summary)
{
this.JobInfoCannotBeCollected = true;
this.ClusterConfiguration = cf;
if (cf is CacheClusterConfiguration)
this.OriginalClusterConfiguration = (cf as CacheClusterConfiguration).ActualConfig(summary);
else
this.OriginalClusterConfiguration = cf;
this.Initialize(summary);
}
/// <summary>
/// Initialize a job info.
/// </summary>
/// <param name="summary">Job to summarize.</param>
private void Initialize(DryadLinqJobSummary summary)
{
this.UsefulCPUTime = TimeSpan.Zero;
this.WastedCPUTime = TimeSpan.Zero;
this.LastUpdatetime = DateTime.Now;
this.ManagerStdoutIncomplete = true; // until we've seen the end
this.ManagerVertex = null;
this.jobSummary = summary;
this.ErrorCode = "";
this.AbortingMsg = "";
this.cachedStages = new Dictionary<string, DryadLinqJobStage>();
this.jobVertices = new JobVertices();
bool terminated = ClusterJobInformation.JobIsFinished(summary.Status);
IClusterResidentObject managerstdoutfile = this.ClusterConfiguration.ProcessStdoutFile(summary.ManagerProcessGuid, terminated, summary.Machine, summary);
if (this.ClusterConfiguration is CacheClusterConfiguration)
this.stdoutpath = managerstdoutfile;
else
{
IClusterResidentObject jmdir = this.ClusterConfiguration.ProcessDirectory(summary.ManagerProcessGuid, terminated, summary.Machine, summary);
if (this.stdoutpath == null)
{
string filename = managerstdoutfile.Name;
//this.stdoutpath = jmdir.GetFile("stdout.txt");
// do this by scanning the folder; this can give additional information about the file size on some platforms
IEnumerable<IClusterResidentObject> files = jmdir.GetFilesAndFolders(filename);
foreach (var f in files)
{
if (f.Exception != null)
{
throw f.Exception;
}
if (f.RepresentsAFolder)
continue;
// there should be exactly one match
this.stdoutpath = f;
break;
}
if (this.stdoutpath == null)
{
throw new CalypsoClusterException("Could not locate JM standard output file in folder " + jmdir);
}
}
}
}
/// <summary>
/// Refresh the job status.
/// </summary>
/// <param name="reporter">Delegate used to report errors.</param>
/// <param name="updateProgress">Used to report progress.</param>
public void RefreshJobStatus(StatusReporter reporter, Action<int> updateProgress)
{
// skip if job is finished
if (this.Summary.Status == ClusterJobInformation.ClusterJobStatus.Failed ||
this.Summary.Status == ClusterJobInformation.ClusterJobStatus.Cancelled ||
this.Summary.Status == ClusterJobInformation.ClusterJobStatus.Succeeded)
return;
ClusterStatus status = this.ClusterConfiguration.CreateClusterStatus();
status.RefreshStatus(this.Summary, reporter, updateProgress);
}
/// <summary>
/// Fill the job info by parsing the stdout.txt.
/// <param name="statusReporter">Delegate used to report errors.</param>
/// <returns>True if it succeeds, false otherwise.</returns>
/// <param name="updateProgress">Delegate used to report progress.</param>
/// </summary>
public bool CollectEssentialInformation(StatusReporter statusReporter, Action<int> updateProgress)
{
this.RefreshJobStatus(statusReporter, updateProgress);
if (this.ManagerVertex == null)
{
this.ManagerVertex = new ExecutedVertexInstance(this, -1, 0, "JobManager", "", this.Summary.Date);
this.ManagerVertex.IsManager = true;
this.ManagerVertex.SetStartInformation(this, this.Summary.Machine, this.Summary.Date, this.Summary.ManagerProcessGuid, "");
this.ManagerVertex.StartCommandTime = this.ManagerVertex.CreationTime = this.ManagerVertex.VertexScheduleTime = this.Summary.Date;
ExecutedVertexInstance.VertexState jmstate = ExecutedVertexInstance.VertexState.Started;
switch (this.Summary.Status)
{
case ClusterJobInformation.ClusterJobStatus.Failed:
jmstate = ExecutedVertexInstance.VertexState.Failed;
break;
/*
case ClusterJobInformation.ClusterJobStatus.Succeeded:
jmstate = ExecutedVertexInstance.VertexState.Successful;
break;
*/
}
this.ManagerVertex.SetState(jmstate);
this.jobVertices.Add(this.ManagerVertex);
}
if (this.stdoutpath == null)
return false;
bool success = this.ParseStdout(this.stdoutpath, statusReporter, updateProgress);
updateProgress(100);
if (!success)
return false;
this.JobInfoCannotBeCollected = false;
statusReporter("Stdout parsed", StatusKind.OK);
this.LastUpdatetime = DateTime.Now;
if (this.Summary.Status == ClusterJobInformation.ClusterJobStatus.Running)
{
foreach (var vertex in this.Vertices.Where(v => v.State == ExecutedVertexInstance.VertexState.Started))
vertex.MarkVertexWasRunning(this.LastUpdatetime);
this.ManagerVertex.MarkVertexWasRunning(this.LastUpdatetime);
}
else if (this.jobSummary.Status == ClusterJobInformation.ClusterJobStatus.Failed)
{
if (this.ManagerVertex.State == ExecutedVertexInstance.VertexState.Started)
this.ManagerVertex.SetState(ExecutedVertexInstance.VertexState.Failed);
foreach (var vertex in this.Vertices.Where(v => v.State == ExecutedVertexInstance.VertexState.Started))
vertex.MarkVertexWasRunning(this.ManagerVertex.End);
}
return true;
}
/// <summary>
/// Given a list of vertex (number \(name\))* pairs return the numbers.
/// </summary>
/// <param name="vertexlist">A string of shape (number \(name\))*.</param>
/// <returns>The list of name-number pairs.</returns>
private static IEnumerable<Tuple<string, int>> ParseVertices(string vertexlist)
{
Regex numberre = new Regex(@"(\d+) (.*)");
while (vertexlist.Length > 0)
{
Match m = numberre.Match(vertexlist);
if (!m.Success)
throw new CalypsoDryadException("Could not find vertex number in " + vertexlist);
string number = m.Groups[1].Value;
// now scan a balanced number of parantheses
string rest = m.Groups[2].Value;
if (rest[0] != '(')
throw new CalypsoDryadException("Expecting open parens after vertex number");
int opened = 0;
int i;
for (i = 0; i < rest.Length; i++)
{
if (rest[i] == '(')
opened++;
else if (rest[i] == ')')
{
opened--;
if (opened == 0)
{
i++;
break;
}
}
}
if (opened != 0 || i <= 2)
throw new CalypsoDryadException("did not find matched parantheses in vertex name in " + vertexlist + ", can't parse");
string name = rest.Substring(1, i - 2); // skip first and last paranthesis
yield return new Tuple<string, int>(name, int.Parse(number));
vertexlist = rest.Substring(i);
}
}
/// <summary>
/// In new versions of L2H some lines start with a timestamp. Parse this timestamp.
/// </summary>
/// <param name="line">Line that may start with [timestamp].</param>
/// <returns>The timestamp at the beginning of the line, or DateTime.MinValue if none.</returns>
static DateTime ParseLineTimestamp(string line)
{
int square = line.IndexOf(']');
DateTime result = DateTime.MinValue;
if (line.StartsWith("[") && square >= 1)
{
string datetime = line.Substring(1, square-1);
DateTime.TryParse(datetime, out result);
}
return result;
}
/// <summary>
/// Try to read a numeric value from a dictionary at a specific key.
/// </summary>
/// <param name="dict">Dictionary containing key-value pairs.</param>
/// <param name="key">Key we are interested in.</param>
/// <returns>The numeric value with that key, or 0 if some error occurs.</returns>
private long TryGetNumeric(Dictionary<string, string> dict, string key)
{
if (!dict.ContainsKey(key)) return 0;
long result;
if (long.TryParse(dict[key], out result))
return result;
return 0;
}
/// <summary>
/// New JM stdout parsing code, for YARN-based DryadLINQ.
/// </summary>
/// <param name="line">Line to parse.</param>
/// <returns>False if the line terminated in a quoted string and has to be combined with the next line.</returns>
private bool ParseStdoutLineNew(string line)
{
if (string.IsNullOrWhiteSpace(line)) return true;
Dictionary<string, string> kvp = Utilities.ParseCSVKVP(line);
if (kvp == null) return false;
var strTs = kvp["logtimelocal"];
int cutOff = strTs.IndexOf("UTC");
if (cutOff >= 0)
{
strTs = strTs.Substring(0, cutOff);
}
DateTime timeStamp = DateTime.Parse(strTs, CultureInfo.InvariantCulture);
timeStamp = timeStamp.ToLocalTime();
this.lastTimestampSeen = timeStamp;
if (kvp.ContainsKey("job"))
{
string operation = kvp["job"];
switch (operation)
{
case "start":
this.ManagerVertex.SetStartInformation(this, this.Summary.Machine, timeStamp, this.Summary.ManagerProcessGuid, "");
this.ManagerVertex.StartCommandTime = this.ManagerVertex.CreationTime = this.ManagerVertex.VertexScheduleTime = timeStamp;
break;
case "stop":
this.ManagerVertex.End = timeStamp;
string exitcode;
if (kvp.TryGetValue("exitcode", out exitcode))
{
this.ErrorCode = exitcode;
int numCode = Convert.ToInt32(exitcode, 16);
if (numCode == 0)
{
this.ManagerVertex.SetState(ExecutedVertexInstance.VertexState.Successful);
}
else
{
this.ManagerVertex.SetState(ExecutedVertexInstance.VertexState.Failed);
}
}
string errorstring;
if (kvp.TryGetValue("errorstring", out errorstring))
{
this.ManagerVertex.AddErrorString(errorstring);
this.AbortingMsg = errorstring;
}
break;
}
}
else if (kvp.ContainsKey("vertex"))
{
string vertex = kvp["vertex"];
int number;
int version;
int dot = vertex.IndexOf('.');
if (dot < 0)
{
number = int.Parse(vertex);
version = int.Parse(kvp["version"]);
}
else
{
number = int.Parse(vertex.Substring(0, dot));
version = int.Parse(vertex.Substring(dot + 1));
}
if (kvp.ContainsKey("transition"))
{
string transition = kvp["transition"];
switch (transition)
{
case "created":
{
string name = kvp["name"];
ExecutedVertexInstance vi = new ExecutedVertexInstance(this, number, version, name, "", timeStamp);
this.jobVertices.Add(vi);
}
break;
case "starting":
{
// not doing anything
break;
}
case "running":
{
string process;
kvp.TryGetValue("id", out process); // "process" is also good
string machine = kvp["computer"];
ExecutedVertexInstance vi = this.jobVertices.FindVertex(number, version);
this.jobVertices.Remap(vi, process);
string pid = this.ClusterConfiguration.ExtractPidFromGuid(process, this.Summary);
DryadProcessIdentifier identifier = new DryadProcessIdentifier(pid);
vi.SetStartInformation(this, machine, timeStamp, identifier, process);
}
break;
case "completed":
{
ExecutedVertexInstance vi = this.jobVertices.FindVertex(number, version);
vi.SetState(ExecutedVertexInstance.VertexState.Successful);
vi.End = timeStamp;
vi.ExitCode = "";
break;
}
case "failed":
{
ExecutedVertexInstance vi = this.jobVertices.FindVertex(number, version);
if (vi.State != ExecutedVertexInstance.VertexState.Started)
vi.SetState(ExecutedVertexInstance.VertexState.Cancelled);
else
vi.SetState(ExecutedVertexInstance.VertexState.Failed);
if (kvp.ContainsKey("errorstring"))
vi.AddErrorString(kvp["errorstring"]);
string exitcode;
if (kvp.TryGetValue("errorcode", out exitcode))
vi.ExitCode = exitcode;
vi.End = timeStamp;
break;
}
}
}
else if (kvp.ContainsKey("outputChannel"))
{
string chan = kvp["outputChannel"];
int channelNo = int.Parse(chan);
ExecutedVertexInstance vi = this.jobVertices.FindVertex(number, version);
if (!kvp.ContainsKey("errorstatus"))
{
}
else
{
if (kvp.ContainsKey("errorstring"))
vi.AddErrorString(kvp["errorstring"]);
}
}
else if (kvp.ContainsKey("inputChannel"))
{
string chan = kvp["inputChannel"];
int channelNo = int.Parse(chan);
ExecutedVertexInstance vi = this.jobVertices.FindVertex(number, version);
if (!kvp.ContainsKey("errorstatus"))
{
}
else
{
vi.AddErrorString(kvp["errorstring"]);
}
}
else if (kvp.ContainsKey("io"))
{
if (kvp["io"] == "starting")
{
ExecutedVertexInstance vi = this.jobVertices.FindVertex(number, version);
int numberOfInputs = (int) TryGetNumeric(kvp, "numberOfInputs");
int numberOfOutputs = (int)TryGetNumeric(kvp, "numberOfOutputs");
if (vi.InputChannels == null)
vi.InputChannels = new Dictionary<int, ChannelEndpointDescription>();
for (int i = 0; i < numberOfInputs; i++)
{
string uri;
if (kvp.TryGetValue("uriIn." + i, out uri))
{
var ched = new ChannelEndpointDescription(false, i, uri, 0);
vi.InputChannels[i] = ched;
}
}
if (vi.OutputChannels == null)
vi.OutputChannels = new Dictionary<int, ChannelEndpointDescription>();
for (int i = 0; i < numberOfOutputs; i++)
{
string uri;
if (kvp.TryGetValue("uriOut." + i, out uri))
{
var ched = new ChannelEndpointDescription(false, i, uri, 0);
vi.OutputChannels[i] = ched;
}
}
}
else if (kvp["io"] == "total")
{
ExecutedVertexInstance vi = this.jobVertices.FindVertex(number, version);
long totalRead = TryGetNumeric(kvp, "totalRead");
long tempRead = TryGetNumeric(kvp, "tempRead");
long tempReadInRack = TryGetNumeric(kvp, "tempReadInRack");
long tempReadCrossRack = TryGetNumeric(kvp, "tempReadCrossRack");
long localRead = TryGetNumeric(kvp, "localRead");
long totalWritten = TryGetNumeric(kvp, "totalWritten");
vi.DataRead = totalRead;
vi.DataWritten = totalWritten;
this.TotalDataRead += totalRead;
this.LocalReadData += localRead;
this.CrossPodDataRead += tempReadCrossRack;
this.IntraPodDataRead += tempReadInRack;
}
else if (kvp["io"] == "running")
{
ExecutedVertexInstance vi = this.jobVertices.FindVertex(number, version);
foreach (int ch in vi.InputChannels.Keys)
{
long bytes = TryGetNumeric(kvp, "rb." + ch);
vi.InputChannels[ch].Size = bytes;
}
foreach (int ch in vi.OutputChannels.Keys)
{
long bytes = TryGetNumeric(kvp, "wb." + ch);
vi.OutputChannels[ch].Size = bytes;
}
long totalRead = TryGetNumeric(kvp, "totalRead");
long totalWritten = TryGetNumeric(kvp, "totalWritten");
vi.DataRead = totalRead;
vi.DataWritten = totalWritten;
}
}
}
return true;
}
/// <summary>
/// Parse one line from the JM standard output.
/// </summary>
/// <param name="line">The line to parse.</param>
private void ParseStdoutLine(string line)
{
DateTime lineTimeStamp = DateTime.MinValue;
if (line.Contains("Created process execution record"))
{
Match m = vertexCreatedRegex.Match(line);
if (m.Success)
{
lineTimeStamp = ParseLineTimestamp(line);
// Created process execution record for vertex (\d+) \((.*)\) v.(\d+) GUID \{?([-A-F0-9]+)\}?
int number = Int32.Parse(m.Groups[1].Value);
string name = m.Groups[2].Value;
int version = Int32.Parse(m.Groups[3].Value);
string guid = m.Groups[4].Value; // on some platforms, e.g. HPC, this identifier is not yet assigned properly
// the vertex may be already there, sometimes numbers are reused...
ExecutedVertexInstance vi = this.jobVertices.FindVertex(number, version);
if (vi == null)
{
vi = new ExecutedVertexInstance(this, number, version, name, guid, lineTimeStamp);
this.jobVertices.Add(vi);
}
}
else
{
m = verticesCreatedRegex.Match(line);
if (m.Success)
{
lineTimeStamp = ParseLineTimestamp(line);
// Created process execution record for vertices (.*) v.(\d+) GUID \{?([-A-F0-9]+)\}?
// Created process execution record for vertices 192 (Merge__41[0]) 223 (Union__45[0]) v.0 GUID {0297A91C-FFEA-42EA-94AF-CD0163A04D45}
int version = Int32.Parse(m.Groups[2].Value);
string vertices = m.Groups[1].Value;
string guid = m.Groups[3].Value; // on some platforms, e.g. HPC, this identifier is not yet assigned properly
IEnumerable<Tuple<string, int>> vertexList = DryadLinqJobInfo.ParseVertices(vertices);
foreach (var p in vertexList)
{
int number = p.Item2;
ExecutedVertexInstance vi = this.jobVertices.FindVertex(number, version);
if (vi == null)
{
vi = new ExecutedVertexInstance(this, number, version, p.Item1, guid, lineTimeStamp);
this.jobVertices.Add(vi);
}
}
}
}
}
else if (line.StartsWith("Creating process"))
{
Match m = processCreatingRegex.Match(line);
if (m.Success)
{
lineTimeStamp = ParseLineTimestamp(line);
// Creating process for vertex (\d+) \((.*)\\) v.(\d+) GUID \{?([-A-F0-9]+)\}? machine (\w+)
int number = Int32.Parse(m.Groups[1].Value);
//string name = m.Groups[2].Value;
int version = Int32.Parse(m.Groups[3].Value);
string guid = m.Groups[4].Value;
ExecutedVertexInstance vi = this.jobVertices.FindVertex(number, version);
if (vi != null)
{
this.jobVertices.Remap(vi, guid);
}
}
}
else if (line.StartsWith("Process was revoked"))
{
Match m = revokedRegex.Match(line);
if (m.Success)
{
string oldGuid = m.Groups[1].Value;
ExecutedVertexInstance vi = this.jobVertices.FindVertexByGuid(oldGuid);
if (vi != null)
{
vi.SetState(ExecutedVertexInstance.VertexState.Revoked);
string newGuid = m.Groups[2].Value;
this.jobVertices.Remap(vi, newGuid);
}
else
{
Trace.TraceInformation("Could not find revoked vertex with guid " + oldGuid);
}
}
}
else if (line.StartsWith("---HiPriTime"))
{
// Scope-specific line which we use to get the i/o information
// ---HiPriTime D7D51A1F-6693-4378-95FD-FC778A67C632,F52CA694-0202-411E-85E9-0C883E770A0E,SV4_Extract_Split[0],Completed,ch1sch010331112,2011-05-03 15:26:01.681 PDT,2011-05-03 15:26:01.696 PDT,2011-05-03 15:26:02.118 PDT,2011-05-03 15:26:04.286 PDT,2011-05-03 15:26:07.656 PDT,2011-05-03 15:26:01.696 PDT,97390825,1498630
string info = line.Substring(13);
string[] parts = info.Split(',');
if (parts.Length >= 13)
{
long read = long.Parse(parts[11]);
long written = long.Parse(parts[12]);
string guid = parts[1];
ExecutedVertexInstance vi = this.jobVertices.FindVertexByGuid(guid);
if (vi != null)
{
vi.DataRead = read;
vi.DataWritten = written;
this.TotalDataRead += read;
}
}
}
else if (line.Contains("Io information"))
{
// HPC-specific line
Match m = ioRegex.Match(line);
if (m.Success)
{
int number = Int32.Parse(m.Groups[1].Value);
int version = Int32.Parse(m.Groups[2].Value);
ExecutedVertexInstance vi = this.jobVertices.FindVertex(number, version);
if (vi != null)
{
vi.DataRead = long.Parse(m.Groups[4].Value);
vi.DataWritten = long.Parse(m.Groups[5].Value);
this.TotalDataRead += vi.DataRead;
}
}
}
else if (line.Contains("Process started"))
{
//those vertices which are being canceled may not be here
Match m = vertexStartRegex.Match(line);
if (m.Success)
{
lineTimeStamp = ParseLineTimestamp(line);
string version = m.Groups[3].Value;
string guid = m.Groups[4].Value;
string pid = this.ClusterConfiguration.ExtractPidFromGuid(guid, this.Summary);
DryadProcessIdentifier identifier = new DryadProcessIdentifier(pid);
string machine = m.Groups[5].Value;
// Process started for vertex 4 (Super__0[0]) v.0 GUID {9DDD0B00-C93F-46D2-9073-1CFD27829300} machine sherwood-255
// Process started for vertices 23 (Merge__29) 24 (Apply__33) v.0 GUID {E945DC5D-9AF6-4732-8770-2A6BF7FA3041} machine sherwood-237
string vertices = m.Groups[2].Value;
// This is a list of (number \(name\))* pairs
// we will assume that the parantheses are matched, or we can't do much
bool onevertex;
if (m.Groups[1].Value == "ex") // one vertEX
onevertex = true;
else if (m.Groups[1].Value == "ices")
onevertex = false;
else
throw new CalypsoDryadException("Can't figure out if one or many vertices");
IEnumerable<Tuple<string, int>> vertexList = DryadLinqJobInfo.ParseVertices(vertices);
int vertexcount = 0;
int iversion = int.Parse(version);
if (lineTimeStamp > this.lastTimestampSeen)
this.lastTimestampSeen = lineTimeStamp;
foreach (var p in vertexList)
{
int number = p.Item2;
ExecutedVertexInstance vi = this.jobVertices.FindVertex(number, iversion);
//new ExecutedVertexInstance(this, number, version, name, identifier, machine, this.lastTimestampSeen);
if (vi == null)
Trace.TraceInformation("Could not find information for vertex {0}.{1}", number, version);
else
vi.SetStartInformation(this, machine, this.lastTimestampSeen, identifier, guid);
vertexcount++;
}
if (vertexcount > 1 && onevertex)
throw new CalypsoDryadException("Expected one vertex, found " + vertexcount);
}
else
{
Trace.TraceInformation("Unexpected parsing error on line {0}", line);
}
}
else if (line.Contains("Abandoning"))
{
Match m = vertexAbandonedRegex.Match(line);
if (m.Success)
{
int number = Int32.Parse(m.Groups[1].Value);
int version = Int32.Parse(m.Groups[2].Value);
ExecutedVertexInstance vi = this.jobVertices.FindVertex(number, version);
if (vi != null)
vi.SetState(ExecutedVertexInstance.VertexState.Abandoned);
}
}
else if (line.Contains("Setting"))
{
Match m = setToFailedlRegex.Match(line);
if (m.Success)
{
// Setting vertex 1461.0 (Merge__13[258]) to failed
// Setting vertex (\d+)\.(\d+) \((.+)\) to failed(.*)
int number = Int32.Parse(m.Groups[1].Value);
int version = Int32.Parse(m.Groups[2].Value);
ExecutedVertexInstance vi = this.jobVertices.FindVertex(number, version);
if (vi != null)
{
vi.SetState(ExecutedVertexInstance.VertexState.Failed);
//vi.ErrorString = m.Groups[4].Value;
}
}
}
else if (line.Contains("Process was terminated"))
{
// terminatedRegex = new Regex(@"Process was terminated Vertex (\d+)\.(\d+) \((.+)\) GUID \{?([-A-F0-9]+)\}? machine (\S+) status (.*)",
// Process was terminated Vertex 11.0 (Select__6[1]) GUID {C1E35A88-F5AD-4A26-BE5F-46B6D515623F} machine sherwood-118 status The operation succeeded
Match m = terminatedRegex.Match(line);
if (m.Success)
{
lineTimeStamp = ParseLineTimestamp(line);
int number = Int32.Parse(m.Groups[1].Value);
int version = Int32.Parse(m.Groups[2].Value);
ExecutedVertexInstance vi = this.jobVertices.FindVertex(number, version);
if (vi != null)
{
// sometimes successful processes are terminated, because they don't report quickly enough being done
if (vi.State != ExecutedVertexInstance.VertexState.Successful)
{
vi.SetState(ExecutedVertexInstance.VertexState.Cancelled);
}
vi.ErrorString = m.Groups[6].Value;
if (lineTimeStamp != DateTime.MinValue)
vi.End = lineTimeStamp;
}
}
}
else if (line.Contains("Timing Information Graph Start Time"))
{
// Cosmos-specific line
// Timing Information Graph Start Time 128654556581866096
Match m = Regex.Match(line, @"Timing Information Graph Start Time (\d+)");
DateTime createTime = Utilities.Convert64time(ClusterConfiguration.GetClusterTimeZone(this.Summary), m.Groups[1].Value);
this.ManagerVertex.SetStartInformation(this, this.Summary.Machine, createTime, this.Summary.ManagerProcessGuid, "");
this.ManagerVertex.StartCommandTime = this.ManagerVertex.CreationTime = this.ManagerVertex.VertexScheduleTime = createTime;
this.lastTimestampSeen = createTime;
}
else if (line.StartsWith("Start time: "))
{
// HPC L2H specific line
// Start time: 04/05/2011 17:25:42.223
DateTime createTime;
bool parse = DateTime.TryParse(line.Substring("Start time: ".Length), out createTime);
if (parse)
{
this.ManagerVertex.SetStartInformation(this, this.Summary.Machine, createTime, this.Summary.ManagerProcessGuid, "");
this.ManagerVertex.StartCommandTime = this.ManagerVertex.CreationTime = this.ManagerVertex.VertexScheduleTime = createTime;
this.lastTimestampSeen = createTime;
}
}
else if (line.Contains("JM Finish time:"))
{
// Cosmos-specific line
// JM Finish time: 129140295499437263 2010-03-25T22:25:49.943726Z
Match m = Regex.Match(line, @"JM Finish time: (\d+)");
DateTime time = Utilities.Convert64time(ClusterConfiguration.GetClusterTimeZone(this.Summary), m.Groups[1].Value);
this.lastTimestampSeen = time;
this.ManagerVertex.End = time;
}
else if (line.StartsWith("Stop time "))
{
// HPC L2H specific line
// Stop time (Exit code = 2148734208): 04/05/2011 17:25:46.614
Regex regex = new Regex(@"Stop time \(Exit code = (.*)\): (.*)");
Match m = regex.Match(line);
if (m.Success)
{
this.ManagerStdoutIncomplete = false;
DateTime time;
bool parse = DateTime.TryParse(m.Groups[2].Value, out time);
if (parse)
{
this.lastTimestampSeen = time;
this.ManagerVertex.End = time;
}
this.ErrorCode = m.Groups[1].Value;
if (this.ErrorCode == "0")
{
this.ManagerVertex.SetState(ExecutedVertexInstance.VertexState.Successful);
}
else
{
this.ManagerVertex.SetState(ExecutedVertexInstance.VertexState.Failed);
}
}
}
else if (line.Contains("Timing Information"))
{
// Timing Information 4 1 Super__0[0] 128654556603428182 0.0000 0.0000 0.0000 0.0000 0.2500
Match m = timingInfoRegex.Match(line);
if (m.Success)
{
int vertex = Int32.Parse(m.Groups[1].Value);
int version = Int32.Parse(m.Groups[2].Value);
DateTime createtime = Utilities.Convert64time(ClusterConfiguration.GetClusterTimeZone(this.Summary), m.Groups[4].Value);
ExecutedVertexInstance vi = jobVertices.FindVertex(vertex, version);
if (vi == null)
return; // we do not keep track of vertices with duplicate scheduling, so these won't show up here
if (vi.State == ExecutedVertexInstance.VertexState.Started)
{
Console.WriteLine("Timing information while vertex is still running " + vi);
//throw new CalypsoClusterException("Timing information for vertex still running: " + vi);
}
DateTime last = vi.SetTiming(createtime, m.Groups[5].Value, m.Groups[6].Value, m.Groups[7].Value, m.Groups[8].Value, m.Groups[9].Value);
if (last > this.lastTimestampSeen)
this.lastTimestampSeen = last;
this.ManagerVertex.MarkVertexWasRunning(last);
try
{
if (vi.State == ExecutedVertexInstance.VertexState.Successful)
this.UsefulCPUTime += vi.RunningTime;
else if (vi.RunningTime > TimeSpan.Zero)
this.WastedCPUTime += vi.RunningTime;
}
catch (Exception ex)
{
Console.WriteLine("Time value exception: " + ex.Message);
}
}
else
throw new CalypsoDryadException("Unmatched timing information line " + line);
}
else if (line.Contains("Process has failed"))
{
// Process has failed Vertex 11.0 (Select__6[1]) GUID {C1E35A88-F5AD-4A26-BE5F-46B6D515623F} machine sherwood-118 Exitcode 0 status The operation succeeded
// failedRegex = new Regex(@"Process has failed Vertex (\d+)\.(\d+) \((.+)\) GUID \{?([-A-F0-9]+)\}? machine (\S+) Exitcode (.*)",
Match m = failedRegex.Match(line);
if (m.Success)
{
lineTimeStamp = ParseLineTimestamp(line);
int vertex = Int32.Parse(m.Groups[1].Value);
int version = Int32.Parse(m.Groups[2].Value);
string exitcode = m.Groups[6].Value;
//string status = m.Groups[7].Value;
ExecutedVertexInstance vi = jobVertices.FindVertex(vertex, version);
if (vi != null)
{
vi.SetState(ExecutedVertexInstance.VertexState.Failed);
vi.ExitCode = exitcode;
if (lineTimeStamp != DateTime.MinValue)
vi.End = lineTimeStamp;
//vi.ErrorString = status;
}
}
}
else if (line.Contains("ABORTING:"))
{
this.AbortingMsg = line.Substring(10);
this.ManagerVertex.SetState(ExecutedVertexInstance.VertexState.Failed);
}
else if (line.Contains("Accurate read data"))
{
Match m = datareadRegex.Match(line);
if (m.Success)
{
this.TotalDataRead = long.Parse(m.Groups[1].Value);
this.LocalReadData = long.Parse(m.Groups[2].Value);
this.IntraPodDataRead = long.Parse(m.Groups[3].Value);
this.CrossPodDataRead = long.Parse(m.Groups[4].Value);
}
}
else if (line.Contains("<ErrorString>"))
{
//some errors contains "Error returned from managed runtime invocation"
//which shows the error is from application code
Match m = Regex.Match(line, @"\<ErrorString\>(.*)\</ErrorString\>");
if (m.Success && lastFailedVertex != null)
{
lastFailedVertex.AddErrorString(System.Web.HttpUtility.HtmlDecode(m.Groups[1].Value));
}
}
else if (line.Contains("Canceling"))
{
// Canceling vertex 1461.0 (Merge__13[258]) due to dependent failure
Match m = cancelRegex.Match(line);
if (m.Success)
{
lineTimeStamp = ParseLineTimestamp(line);
int vertex = Int32.Parse(m.Groups[1].Value);
int version = Int32.Parse(m.Groups[2].Value);
string name = m.Groups[3].Value;
ExecutedVertexInstance vi = jobVertices.FindVertex(vertex, version);
if (vi != null)
{
if (vi.State == ExecutedVertexInstance.VertexState.Successful)
vi.SetState(ExecutedVertexInstance.VertexState.Invalidated);
else
vi.SetState(ExecutedVertexInstance.VertexState.Cancelled);
if (lineTimeStamp != DateTime.MinValue)
vi.End = lineTimeStamp;
}
else
{
// TODO: this should not be needed, but this is a workaround for a bug in the HPC L2H software
vi = new ExecutedVertexInstance(this, vertex, version, name, "", lineTimeStamp);
vi.SetState(ExecutedVertexInstance.VertexState.Cancelled);
this.jobVertices.Add(vi);
}
// Process wasn't even started, so there is nothing to cancel
}
}
else if (line.Contains("Application"))
{
//the job ends successfully
Regex endSuccessRegex = new Regex(@"Application completed successfully.");
//the job failed
Regex endFailRegex = new Regex(@"Application failed with error code (.*)");
Match m1 = endFailRegex.Match(line);
if (m1.Success)
{
this.ErrorCode = m1.Groups[1].Value;
this.ManagerStdoutIncomplete = false;
this.ManagerVertex.SetState(ExecutedVertexInstance.VertexState.Failed);
}
else
{
Match m2 = endSuccessRegex.Match(line);
if (m2.Success)
{
this.ManagerVertex.SetState(ExecutedVertexInstance.VertexState.Successful);
this.ManagerStdoutIncomplete = false;
}
}
}
else if (line.StartsWith("Input"))
{
// Input vertex %u (%s) had %u read failure%s\n
Match m = inputFailureRegex.Match(line);
if (m.Success)
{
this.AbortingMsg = line;
}
}
else if (line.Contains("Vertex"))
{
// terminationRegex = new Regex(@"Vertex (\d+)\.(\d+) \((.+)\) machine (\S+) guid \{?([-0-9A-F]+)\}? status (.*)"
Match m = terminationRegex.Match(line);
if (m.Success)
{
lineTimeStamp = ParseLineTimestamp(line);
int vertex = Int32.Parse(m.Groups[1].Value);
int version = Int32.Parse(m.Groups[2].Value);
ExecutedVertexInstance vi = this.jobVertices.FindVertex(vertex, version);
if (vi == null)
{
Trace.TraceInformation("Could not find vertex {0}.{1} line {2}", vertex, version, line);
}
else
{
bool failed = vi.SetTermination(m.Groups[6].Value, lineTimeStamp);
if (failed)
this.lastFailedVertex = vi;
}
}
}
if (lineTimeStamp != DateTime.MinValue)
this.lastTimestampSeen = lineTimeStamp;
}
private Dictionary<string, DryadLinqJobStage> cachedStages;
/// <summary>
/// Get information about a particular stage.
/// </summary>
/// <param name="stagename">Name of stage sought.</param>
/// <returns>A description of the stage in question, or null if there are no vertices in that stage.</returns>
public DryadLinqJobStage GetStage(string stagename)
{
if (this.cachedStages.ContainsKey(stagename))
return this.cachedStages[stagename];
List<ExecutedVertexInstance> stageVertices = this.jobVertices.GetStageVertices(stagename);
if (stageVertices == null)
stageVertices = new List<ExecutedVertexInstance>();
DryadLinqJobStage retval = new DryadLinqJobStage(stagename, stageVertices);
this.cachedStages.Add(stagename, retval);
return retval;
}
private ISharedStreamReader cachedStdoutReader = null;
/// <summary>
/// Remember how many lines were parsed, and skip them on a second invocation.
/// </summary>
private int stdoutLinesParsed;
/// <summary>
/// Parse the stdout.txt file from the job manager.
/// </summary>
/// <param name="file">File to parse.</param>
/// <param name="statusReporter">Delegate used to report errors.</param>
/// <param name="updateProgress">Delegate used to report progress.</param>
/// <returns>True if the parsing succeeds.</returns>
private bool ParseStdout(IClusterResidentObject file, StatusReporter statusReporter, Action<int> updateProgress)
{
int currentLine = 0;
if (this.stdoutLinesParsed == 0)
// don't lose it if we are only parsing the tail.
this.lastTimestampSeen = this.Summary.Date; // start from the job submission timestamp
// we are reusing the stream
this.stdoutLinesParsed = 0;
try
{
long filesize = file.Size;
long readbytes = 0;
string message = "Scanning JM stdout " + file;
if (filesize >= 0)
message += string.Format("({0:N0} bytes)", filesize);
statusReporter(message, StatusKind.LongOp);
if (this.cachedStdoutReader == null)
this.cachedStdoutReader = file.GetStream();
if (this.cachedStdoutReader.Exception != null)
{
statusReporter("Exception while opening stdout " + this.cachedStdoutReader.Exception.Message, StatusKind.Error);
return false;
}
while (!this.cachedStdoutReader.EndOfStream)
{
string line = this.cachedStdoutReader.ReadLine();
readbytes += line.Length;
if (currentLine >= this.stdoutLinesParsed)
{
while (true)
{
int startLine = currentLine;
bool completeLine = true;
try
{
completeLine = this.ParseStdoutLineNew(line);
}
catch (Exception ex)
{
statusReporter(string.Format("Line {0}: Exception {1}", currentLine, ex.Message), StatusKind.Error);
Console.WriteLine("Line {0}: Exception {1}", currentLine, ex);
}
if (!completeLine)
{
if (this.cachedStdoutReader.EndOfStream)
{
throw new Exception("File ended while scanning for closing quote started on line " + startLine);
}
string extraline = this.cachedStdoutReader.ReadLine();
line += "\n" + extraline;
currentLine++;
}
else break;
}
}
currentLine++;
if (currentLine % 100 == 0 && filesize > 0)
{
updateProgress(Math.Min(100, (int)(100 * readbytes / filesize)));
}
}
this.stdoutLinesParsed = currentLine;
if (this.ManagerVertex != null)
{
if (this.ManagerVertex.End == DateTime.MinValue)
// approximation
this.ManagerVertex.End = this.lastTimestampSeen;
// we are done with this stream
if (this.ManagerVertex.State == ExecutedVertexInstance.VertexState.Failed ||
this.ManagerVertex.State == ExecutedVertexInstance.VertexState.Successful)
this.cachedStdoutReader.Close();
}
return true;
}
catch (Exception e)
{
statusReporter("Exception while reading stdout " + e.Message, StatusKind.Error);
Trace.TraceInformation(e.ToString());
return false;
}
}
/// <summary>
/// How many log files were successfuly parsed.
/// </summary>
private int logFilesParsed;
/// <summary>
/// Parse the logs generated by the Job Manager and learn more information from them.
/// This function should be called after parsing the stdout.
/// This function is extremely slow; it may be invoked on a background thread.
/// <param name="statusReporter">Delegate used to report errors.</param>
/// <returns>True on success.</returns>
/// <param name="updateProgress">Delegate used to report progress.</param>
/// </summary>
public bool ParseJMLogs(StatusReporter statusReporter, Action<int> updateProgress)
{
IClusterResidentObject dir = this.ClusterConfiguration.ProcessLogDirectory(this.Summary.ManagerProcessGuid, this.ManagerVertex.VertexIsCompleted, this.Summary.Machine, this.Summary);
if (dir.Exception != null)
{
statusReporter("Exception finding logs in " + dir, StatusKind.Error);
return false;
}
string pattern = this.ClusterConfiguration.JMLogFilesPattern(false, this.Summary);
List<IClusterResidentObject> logfiles = dir.GetFilesAndFolders(pattern).ToList();
long totalWork = 0;
foreach (var logfile in logfiles)
{
if (logfile.Size >= 0 && totalWork > 0)
totalWork += logfile.Size;
else
totalWork = -1;
}
bool success = true;
statusReporter(string.Format("Parsing {0} log files", logfiles.Count - this.logFilesParsed), StatusKind.OK);
int currentFile = 0;
bool invalidateCache = false;
foreach (var logfile in logfiles)
{
if (currentFile >= this.logFilesParsed)
{
invalidateCache = true;
success = this.ParseJMLogFile(logfile, statusReporter);
}
if (!success)
// stop at first failure
break;
currentFile++;
updateProgress(100 * currentFile / logfiles.Count);
}
updateProgress(100);
if (invalidateCache)
this.InvalidateCaches();
// reparse the last one again
this.logFilesParsed = currentFile - 1;
return success;
}
/// <summary>
/// Parse a log file of the job manager and extract useful information.
/// </summary>
/// <param name="logfile">Log file to parse.</param>
/// <param name="statusReporter">Delegate used to parse errors.</param>
/// <returns>True if parsing succeeds.</returns>
internal bool ParseJMLogFile(IClusterResidentObject logfile, StatusReporter statusReporter)
{
bool success = true;
ISharedStreamReader sr = logfile.GetStream();
if (sr.Exception != null)
{
statusReporter("Exception while opening file " + logfile + ":" + sr.Exception.Message, StatusKind.Error);
return false;
}
while (!sr.EndOfStream)
{
string line = sr.ReadLine();
if (!line.Contains("DryadProfiler")) continue;
CosmosLogEntry le = new CosmosLogEntry(line);
if (le.Subsystem != "DryadProfiler") continue;
if (!le.Message.EndsWith("channel status")) continue;
Dictionary<string, string> kvp = Utilities.ParseCommaSeparatedKeyValuePair(le.ExtraInfo);
string verver = kvp["Vertex"];
string[] numbers = verver.Split('.');
int vertex = int.Parse(numbers[0]);
int version = int.Parse(numbers[1]);
ExecutedVertexInstance vi = this.jobVertices.FindVertex(vertex, version);
if (vi == null)
{
// We have overshot the information about the vertices parsed from stdout; stop parsing here
success = false;
break;
}
if (le.Message == "Input channel status")
{
// Vertex=69.0, Name=Merge__446[3], MachPod=sherwood-005:pod1, TotalRead=1470802, TotalReadFromMach=1470802, TotalReadCrossMach=1470802, TotalReadCrossPod=0
long info = long.Parse(kvp["TotalRead"]);
vi.DataRead = info;
}
else if (le.Message == "Output channel status")
{
// Vertex=69.0, Name=Merge__446[3], MachPod=sherwood-005:pod1, TotalWrite=1213418
long info = long.Parse(kvp["TotalWrite"]);
vi.DataWritten = info;
}
}
sr.Close();
return success;
}
/// <summary>
/// The list of all stage names in the job.
/// </summary>
/// <returns>An iterator over the stage names.</returns>
public IEnumerable<string> GetStageNames()
{
return this.jobVertices.GetAllStageNames();
}
/// <summary>
/// The list of all stages in the job.
/// </summary>
/// <returns>An iterator over all stages.</returns>
public IEnumerable<DryadLinqJobStage> AllStages()
{
return this.GetStageNames().Select(this.GetStage);
}
/// <summary>
/// Generate a layout suitable for drawing the plan.
/// </summary>
/// <returns>A graph layout.</returns>
/// <param name="statusReporter">Delegate used to report errors.</param>
// ReSharper disable once UnusedParameter.Global
public GraphLayout ComputePlanLayout(StatusReporter statusReporter)
{
IEnumerable<DryadLinqJobStage> stages = this.AllStages().OrderBy(s => s.StartTime).ToList();
if (!stages.Any())
// no layout to compute
return null;
DateTime jobStartTime = this.StartJMTime;
DateTime lastTime = stages.Max(s => s.EndTime);
if (lastTime == jobStartTime)
// avoid the degenerate case
lastTime = jobStartTime + new TimeSpan(0, 0, 1);
GraphLayout result = new GraphLayout((lastTime - jobStartTime).TotalSeconds, stages.Count()*2);
int currentStage = 0;
foreach (DryadLinqJobStage s in stages)
{
// node represents the schedule: horizontal position is starttime - endtime
DateTime endTime = s.EndTime;
DateTime startTime = s.StartTime;
if (endTime <= jobStartTime) // unknown time?
endTime = lastTime; // assume still running
if (startTime <= jobStartTime)
startTime = jobStartTime;
GraphLayout.GraphNode node = new GraphLayout.GraphNode(
(startTime - jobStartTime).TotalSeconds, currentStage*2, (endTime - startTime).TotalSeconds, 1);
node.Shape = GraphLayout.GraphNode.NodeShape.Box;
node.Label = s.Name;
node.Stage = s.Name;
result.Add(node);
currentStage++;
}
return result;
}
/// <summary>
/// Find a vertex having specified the process id.
/// </summary>
/// <param name="id">Process id.</param>
/// <returns>The vertex with the specified guid.</returns>
public ExecutedVertexInstance FindVertex(DryadProcessIdentifier id)
{
return this.jobVertices.FindVertexByGuid(id.ToString());
}
/// <summary>
/// Invalidate the cached information.
/// </summary>
public void InvalidateCaches()
{
this.cachedStages.Clear();
}
}
/// <summary>
/// Summary information about a job plan.
/// </summary>
public abstract class DryadJobStaticPlan
{
/// <summary>
/// Connection between two stages.
/// </summary>
public class Connection
{
/// <summary>
/// Arity of connection.
/// </summary>
public enum ConnectionType
{
/// <summary>
/// Point-to-point connection between two stages.
/// </summary>
PointToPoint,
/// <summary>
/// Cross-product connection between two stages.
/// </summary>
AllToAll
};
/// <summary>
/// Type of channel backing the connection.
/// </summary>
public enum ChannelType
{
/// <summary>
/// Persistent file.
/// </summary>
DiskFile,
/// <summary>
/// In-memory fifo.
/// </summary>
Fifo,
/// <summary>
/// TCP pipe.
/// </summary>
TCP
}
/// <summary>
/// Stage originating the connection.
/// </summary>
public Stage From { internal set; get; }
/// <summary>
/// Stage terminating the connection.
/// </summary>
public Stage To { internal set; get; }
/// <summary>
/// Type of connection.
/// </summary>
public ConnectionType Arity { get; internal set; }
/// <summary>
/// Type of channel backing the connection.
/// </summary>
public ChannelType ChannelKind { get; internal set; }
/// <summary>
/// Dynamic manager associated with the connection.
/// </summary>
public string ConnectionManager { get; internal set; }
/// <summary>
/// Color used to represent the connection.
/// </summary>
/// <returns>A string describing the color.</returns>
public string Color()
{
switch (this.ChannelKind)
{
case ChannelType.DiskFile:
return "black";
case ChannelType.Fifo:
return "red";
case ChannelType.TCP:
return "yellow";
default:
throw new Exception("Unknown channel kind " + this.ChannelKind);
}
}
}
/// <summary>
/// Per-node connection information (should be per-edge...)
/// </summary>
protected struct ConnectionInformation
{
/// <summary>
/// Type of connection.
/// </summary>
public Connection.ConnectionType Arity { get; internal set; }
/// <summary>
/// Type of channel backing the connection.
/// </summary>
public Connection.ChannelType ChannelKind { get; internal set; }
/// <summary>
/// Dynamic manager associated with the connection.
/// </summary>
public string ConnectionManager { get; internal set; }
}
/// <summary>
/// Information about a stage.
/// </summary>
public class Stage
{
/// <summary>
/// Stage name.
/// </summary>
public string Name { get; internal set; }
/// <summary>
/// Code executed in the stage.
/// </summary>
public string [] Code { get; internal set; }
/// <summary>
/// DryadLINQ operator implemented by the stage.
/// </summary>
public string Operator { get; internal set; }
/// <summary>
/// Number of vertices in stage.
/// </summary>
public int Replication { get; internal set; }
/// <summary>
/// Unique identifier.
/// </summary>
public int Id { get; set; }
/// <summary>
/// True if the stage is an input.
/// </summary>
public bool IsInput { get; internal set; }
/// <summary>
/// True if the stage is an output.
/// </summary>
public bool IsOutput { get; internal set; }
/// <summary>
/// True if the stage is a tee.
/// </summary>
public bool IsTee { get; internal set; }
/// <summary>
/// True if the stage is a concatenation.
/// </summary>
public bool IsConcat { get; internal set; }
/// <summary>
/// True if the stage is virtual (no real vertices synthesized).
/// </summary>
public bool IsVirtual { get { return this.IsInput || this.IsOutput || this.IsTee || this.IsConcat; } }
/// <summary>
/// Only defined for tables.
/// </summary>
public string Uri { get; internal set; }
/// <summary>
/// Only defined for tables.
/// </summary>
public string UriType { get; internal set; }
}
/// <summary>
/// Map from stage id to stage.
/// </summary>
protected readonly Dictionary<int, Stage> stages;
/// <summary>
/// List of inter-stage connections in the plan.
/// </summary>
protected readonly List<Connection> connections;
/// <summary>
/// Store here per-node connection information (map from node id).
/// </summary>
protected readonly Dictionary<int, ConnectionInformation> perNodeConnectionInfo;
/// <summary>
/// Stream containing the plan.
/// </summary>
protected readonly ISharedStreamReader planStream;
/// <summary>
/// Create a dryadlinq job plan starting from an xml plan file.
/// </summary>
/// <param name="plan">Stream containing the plan.</param>
protected DryadJobStaticPlan(ISharedStreamReader plan)
{
if (plan.Exception != null)
// don't do this
throw plan.Exception;
this.planStream = plan;
this.stages = new Dictionary<int, Stage>();
this.connections = new List<Connection>();
this.perNodeConnectionInfo = new Dictionary<int, ConnectionInformation>();
this.fictitiousStages = 0;
}
/// <summary>
/// Parse the query plan: cluster-specific.
/// </summary>
protected abstract void ParseQueryPlan();
int fictitiousStages;
/// <summary>
/// Create a fictitious node for the job manager.
/// </summary>
public void AddFictitiousStages()
{
this.fictitiousStages = 2;
Stage stage = new Stage();
stage.Id = -1;
stage.Replication = 1;
stage.Operator = "Job Manager";
stage.Name = "JobManager";
stage.Code = null;
this.stages.Add(stage.Id, stage);
stage = new Stage();
stage.Id = -2;
stage.Replication = 1;
stage.Operator = "All vertices";
stage.Name = "All vertices";
stage.Code = null;
this.stages.Add(stage.Id, stage);
}
/// <summary>
/// Find the stage given the stage id as a string.
/// </summary>
/// <param name="stageId">Stage id.</param>
/// <returns>A handle to the stage with the specified static Id.</returns>
public Stage GetStageByStaticId(string stageId)
{
int id = int.Parse(stageId);
return this.stages[id];
}
/// <summary>
/// Find the stage given the stage name.
/// </summary>
/// <param name="name">Name of stage to return.</param>
/// <returns>The stage with the given name or null.</returns>
public Stage GetStageByName(string name)
{
foreach (Stage s in this.stages.Values)
{
if (s.Name.Equals(name))
return s;
}
return null;
}
/// <summary>
/// The list of all stages in the plan.
/// </summary>
/// <returns>An iterator over the list of stages.</returns>
public IEnumerable<Stage> GetAllStages()
{
return this.stages.Values;
}
/// <summary>
/// The list of all connections in the plan.
/// </summary>
/// <returns>An iterator over a list of connections.</returns>
public IEnumerable<Connection> GetAllConnections()
{
return this.connections;
}
/// <summary>
/// Number of stages in static plan.
/// </summary>
public int StageCount
{
get
{
return this.stages.Count - this.fictitiousStages;
}
}
/// <summary>
/// Get all connections adjacent to a stage. Warning: this method is inefficient.
/// </summary>
/// <param name="inputs">If true return the stage inputs, else return the stage outputs.</param>
/// <param name="stage">Stage we are looking for.</param>
/// <returns>A list of connections.</returns>
public IEnumerable<Connection> GetStageConnections(Stage stage, bool inputs)
{
foreach (Connection c in this.GetAllConnections())
{
if (inputs && c.To == stage)
yield return c;
else if (!inputs && c.From == stage)
yield return c;
}
}
/// <summary>
/// Factory: create the plan for a given job.
/// </summary>
/// <param name="dryadLinqJobInfo">Job to create plan for.</param>
/// <param name="reporter">Delegate used to report errors.</param>
/// <returns>The plan or null.</returns>
public static DryadJobStaticPlan CreatePlan(DryadLinqJobInfo dryadLinqJobInfo, StatusReporter reporter)
{
reporter("Trying to build static plan", StatusKind.LongOp);
ClusterConfiguration config = dryadLinqJobInfo.ClusterConfiguration;
IClusterResidentObject file = config.JobQueryPlan(dryadLinqJobInfo.Summary);
if (config is CacheClusterConfiguration)
config = (config as CacheClusterConfiguration).ActualConfig(dryadLinqJobInfo.Summary);
if (file.Exception == null)
{
DryadJobStaticPlan retval;
{
retval = new DryadLinqJobStaticPlan(config, file.GetStream());
}
retval.ParseQueryPlan();
return retval;
}
else
{
reporter("Exception while looking for plan " + file.Exception.Message, StatusKind.Error);
return null;
}
}
}
/// <summary>
/// DryadLINQ-specific job information.
/// </summary>
public class DryadLinqJobStaticPlan : DryadJobStaticPlan
{
/// <summary>
/// Create a DryadLinqJobStaticPlan.
/// </summary>
/// <param name="config">Cluster configuration.</param>
/// <param name="planFile">Stream containing the file.</param>
// ReSharper disable once UnusedParameter.Local
public DryadLinqJobStaticPlan(ClusterConfiguration config, ISharedStreamReader planFile)
: base(planFile)
{
}
/// <summary>
/// Parse an XML query plan and represent that information.
/// </summary>
protected override void ParseQueryPlan()
{
string planString = this.planStream.ReadToEnd();
XDocument plan = XDocument.Parse(planString);
// ReSharper disable PossibleNullReferenceException
XElement query = plan.Root.Elements().First(e => e.Name == "QueryPlan");
IEnumerable<XElement> vertices = query.Elements().Where(e => e.Name == "Vertex");
foreach (XElement v in vertices)
{
Stage stage = new Stage();
stage.Id = int.Parse(v.Element("UniqueId").Value);
stage.Replication = int.Parse(v.Element("Partitions").Value);
stage.Operator = v.Element("Type").Value;
stage.Name = v.Element("Name").Value;
{
string code = v.Element("Explain").Value;
string[] lines = code.Split('\n');
if (lines.Length > 1)
{
stage.Code = lines.Skip(1). // drop stage name
Select(l => l.Trim()). // remove leading tab
ToArray();
}
else
{
stage.Code = new string[] { };
}
}
this.stages.Add(stage.Id, stage);
{
// These should be connection attributes, not stage attributes.
string cht = v.Element("ChannelType").Value;
string connectionManager = v.Element("DynamicManager").Element("Type").Value;
string connection = v.Element("ConnectionOperator").Value;
ConnectionInformation info = new ConnectionInformation();
info.ConnectionManager = connectionManager;
switch (connection)
{
case "Pointwise":
info.Arity = Connection.ConnectionType.PointToPoint;
break;
case "CrossProduct":
info.Arity = Connection.ConnectionType.AllToAll;
break;
default:
throw new CalypsoDryadException("Don't know about connection of type " + connection);
}
switch (cht)
{
case "DiskFile":
info.ChannelKind = Connection.ChannelType.DiskFile;
break;
case "TCPPipe":
info.ChannelKind = Connection.ChannelType.TCP;
break;
case "MemoryFIFO":
info.ChannelKind = Connection.ChannelType.Fifo;
break;
default:
throw new CalypsoDryadException("Don't know about channel of type " + cht);
}
this.perNodeConnectionInfo.Add(stage.Id, info);
}
switch (stage.Operator)
{
case "InputTable":
stage.IsInput = true;
stage.UriType = v.Element("StorageSet").Element("Type").Value;
stage.Uri = v.Element("StorageSet").Element("SourceURI").Value;
break;
case "OutputTable":
stage.IsOutput = true;
stage.UriType = v.Element("StorageSet").Element("Type").Value;
stage.Uri = v.Element("StorageSet").Element("SinkURI").Value;
break;
case "Tee":
stage.IsTee = true;
break;
case "Concat":
stage.IsConcat = true;
break;
}
if (!v.Elements("Children").Any())
continue;
bool first = true;
var children = v.Element("Children").Elements().Where(e => e.Name == "Child").ToList();
foreach (XElement child in children)
{
// This code parallels the graphbuilder.cpp for XmlExecHost
Connection conn = new Connection();
int fromid = int.Parse(child.Element("UniqueId").Value);
ConnectionInformation fromConnectionInformation = this.perNodeConnectionInfo[fromid];
Stage from = this.stages[fromid];
conn.From = from;
conn.To = stage;
conn.ChannelKind = fromConnectionInformation.ChannelKind;
ConnectionInformation thisConnectionInformation = this.perNodeConnectionInfo[stage.Id];
switch (thisConnectionInformation.ConnectionManager)
{
case "FullAggregator":
case "HashDistributor":
case "RangeDistributor":
// Ignore except first child
if (first)
{
first = false;
conn.ConnectionManager = thisConnectionInformation.ConnectionManager;
}
else
{
conn.ConnectionManager = "";
}
break;
case "PartialAggregator":
case "Broadcast":
// All children have the same connection manager
conn.ConnectionManager = thisConnectionInformation.ConnectionManager;
break;
case "Splitter":
// The connection manager depends on the number of children
if (first)
{
first = false;
if (children.Count() == 1)
conn.ConnectionManager = thisConnectionInformation.ConnectionManager;
else
conn.ConnectionManager = "SemiSplitter";
}
else
{
conn.ConnectionManager = "";
}
break;
case "None":
case "":
break;
}
conn.Arity = fromConnectionInformation.Arity;
this.connections.Add(conn);
}
}
// ReSharper restore PossibleNullReferenceException
}
}
/// <summary>
/// Scope-specific job information.
/// </summary>
public class ScopeJobStaticPlan : DryadJobStaticPlan
{
private readonly ISharedStreamReader vertexDef;
/// <summary>
/// Create a ScopeJobStaticPlan.
/// </summary>
/// <param name="config">Cluster configuration.</param>
/// <param name="planFile">Stream containing the file.</param>
/// <param name="vertexDef">File containing the vertex definition (ScopeVertexDef.xml).</param>
// ReSharper disable once UnusedParameter.Local
public ScopeJobStaticPlan(ClusterConfiguration config, ISharedStreamReader planFile, ISharedStreamReader vertexDef)
: base(planFile)
{
this.vertexDef = vertexDef;
}
/// <summary>
/// Simplify the name of an IO.
/// </summary>
/// <param name="ioname">Name to simplify.</param>
/// <returns>The simplified name.</returns>
private static string NormalizeIOName(string ioname)
{
// drop everything between square braces
Regex re = new Regex(@"\[.*\]");
string result = re.Replace(ioname, "");
return result;
}
/// <summary>
/// Parse the Algebra file.
/// </summary>
private void ParseAlgebra()
{
// TODO: this parser is not really complete, as I don't understand the semantics of all xml elements.
Dictionary<string, string> outToStage = new Dictionary<string, string>(); // map an output to a stage name. Assume that ios have unique names.
Dictionary<string, List<string>> inputs = new Dictionary<string, List<string>>();
// <CsJobAlgebra> <graph> <process> ...
string planString = this.planStream.ReadToEnd();
XDocument plan = XDocument.Parse(planString);
// ReSharper disable PossibleNullReferenceException
XElement graph = plan.Root.Element("graph"); // graph node, children are stages
// add stages
int id = 0;
foreach (XElement child in graph.Elements())
{
if (child.Name == "process")
{
string stageName = child.Attribute("id").Value;
Stage stage = new Stage();
stage.Name = stageName;
stage.Replication = 1;
stage.Code = new string[] { child.Attribute("command").Value };
stage.Id = id++;
this.stages.Add(stage.Id, stage);
List<string> stageInputs = new List<string>();
inputs.Add(stageName, stageInputs);
foreach (var io in child.Elements())
{
if (io.Name != "input" && io.Name != "output")
continue;
string cosmosStream = io.Attribute("cosmosStream") != null ? io.Attribute("cosmosStream").Value : null;
string structuredStream = io.Attribute("structuredStream") != null ? io.Attribute("structuredStream").Value : null;
string ioid = NormalizeIOName(io.Attribute("id").Value);
string streamname = cosmosStream ?? structuredStream;
if (io.Name == "input")
{
stageInputs.Add(ioid);
if (streamname != null)
{
Stage alreadyDone = this.GetStageByName(streamname);
if (alreadyDone == null)
{
Stage input = new Stage();
input.Id = id++;
input.IsInput = true;
input.Name = streamname;
input.Code = new string[0];
input.Replication = 1;
input.UriType = "cosmos";
input.Uri = "cosmos://" + streamname;
this.stages.Add(input.Id, input);
}
outToStage.Add(ioid, streamname);
}
}
else if (io.Name == "output")
{
outToStage.Add(ioid, stageName);
if (streamname != null)
{
Stage alreadyDone = this.GetStageByName(streamname);
if (alreadyDone == null)
{
Stage output = new Stage();
output.IsOutput = true;
output.Replication = 1;
output.Name = streamname;
output.Code = new string[0];
output.Id = id++;
output.UriType = "cosmos";
output.Uri = "cosmos://" + streamname;
this.stages.Add(output.Id, output);
inputs.Add(streamname, new List<string> { ioid });
}
}
}
}
}
else if (child.Name == "dataset")
{
string stageName = child.Attribute("id").Value;
stageName = NormalizeIOName(stageName);
Stage stage = new Stage();
stage.Name = stageName;
stage.Replication = 1;
stage.IsConcat = true;
stage.Code = new string[0];
stage.Id = id++;
this.stages.Add(stage.Id, stage);
List<string> stageInputs = new List<string>();
inputs.Add(stageName, stageInputs);
foreach (var io in child.Elements())
{
if (io.Name == "element")
{
string ioid = NormalizeIOName(io.Attribute("id").Value);
stageInputs.Add(ioid);
}
}
// implicit output with stage name
outToStage.Add(stage.Name, stage.Name);
}
else if (child.Name == "inputStreamList")
{
Stage input = new Stage();
input.Id = id++;
input.IsInput = true;
input.Name = child.Attribute("id").Value;
input.Replication = 1;
input.UriType = "cosmos";
input.Uri = "inputStreamList";
input.Code = child.Elements().Select(e => e.Attribute("cosmosPath").Value).ToArray();
this.stages.Add(input.Id, input);
outToStage.Add(input.Name, input.Name);
}
else if (child.Name == "outputStreamList")
{
Stage output = new Stage();
output.Id = id++;
output.IsOutput = true;
output.Name = child.Attribute("id").Value;
output.Replication = 1;
output.UriType = "cosmos";
output.Uri = "outputStreamList";
output.Code = child.Elements().Select(e => e.Attribute("cosmosPath").Value).ToArray();
this.stages.Add(output.Id, output);
inputs.Add(output.Name, new List<string> { output.Name });
}
}
// scan the dictionaries and build the edges
foreach (string stage in inputs.Keys)
{
foreach (string inputName in inputs[stage])
{
string iName = inputName;
if (outToStage.ContainsKey(iName))
{
string sourceStage = outToStage[iName];
Connection conn = new Connection();
conn.From = this.GetStageByName(sourceStage);
conn.To = this.GetStageByName(stage);
conn.ChannelKind = Connection.ChannelType.DiskFile;
conn.ConnectionManager = "";
conn.Arity = Connection.ConnectionType.PointToPoint;
this.connections.Add(conn);
}
else
{
Trace.TraceInformation("Could not find stage for input {0}", iName);
}
}
}
// ReSharper restore PossibleNullReferenceException
}
/// <summary>
/// Parse the vertex definition file.
/// </summary>
private void ParseVertexDef()
{
if (this.vertexDef.Exception != null)
return;
// <ScopeVertices> <ScopeVertex> <operator> <input> </input> <output> </output>
string planString = this.vertexDef.ReadToEnd();
XDocument vxDef = XDocument.Parse(planString);
XElement vertices = vxDef.Root;
// ReSharper disable PossibleNullReferenceException
foreach (XElement vertex in vertices.Elements())
{
List<string> code = new List<string>();
string id = vertex.Attribute("id").Value;
Stage stage = this.GetStageByName(id);
if (stage == null)
{
Trace.TraceInformation("Could not find stage {0}", id);
continue;
}
foreach (XElement op in vertex.Elements("operator"))
{
string className = op.Attribute("className").Value;
if (op.Attribute("args") != null)
className += " " + op.Attribute("args").Value;
code.Add(className);
foreach (XElement input in op.Elements("input"))
{
XAttribute indexatt = input.Attribute("inputIndex");
string index = indexatt != null ? indexatt.Value : " ";
string schema = input.Attribute("schema").Value;
code.Add("\tI" + index + ": " + schema);
}
foreach (XElement output in op.Elements("output"))
{
XAttribute indexatt = output.Attribute("outputIndex");
string index = indexatt != null ? indexatt.Value : " ";
string schema = output.Attribute("schema").Value;
code.Add("\tO" + index + ": " + schema);
}
}
stage.Code = code.ToArray();
// ReSharper restore PossibleNullReferenceException
}
}
/// <summary>
/// Parse the query plan for a Scope job.
/// </summary>
protected override void ParseQueryPlan()
{
this.ParseAlgebra();
this.ParseVertexDef();
}
}
/// <summary>
/// A collection describing all the vertices encountered so far in a job.
/// </summary>
internal class JobVertices
{
/// <summary>
/// Vertices indexed by numeric vertex id. A vertex can have multiple executions.
/// </summary>
private readonly Dictionary<int, List<ExecutedVertexInstance>> vertices;
/// <summary>
/// Total number of vertices in collection.
/// </summary>
private int count;
private readonly Dictionary<string, List<ExecutedVertexInstance>> jobStages;
private readonly Dictionary<string, ExecutedVertexInstance> vertexByGuid;
/// <summary>
/// Create a collection representing the job vertices.
/// </summary>
public JobVertices()
{
this.count = 0;
this.vertexByGuid = new Dictionary<string, ExecutedVertexInstance>();
this.vertices = new Dictionary<int, List<ExecutedVertexInstance>>();
this.jobStages = new Dictionary<string, List<ExecutedVertexInstance>>();
this.jobStages.Add("All vertices", new List<ExecutedVertexInstance>()); // this list holds all vertices
}
/// <summary>
/// Number of stages in job.
/// </summary>
public int ExecutedStageCount { get { return this.jobStages.Count - 2; } } // subtract GM and all vertices
/// <summary>
/// The list of vertices in this stage.
/// </summary>
/// <param name="stagename">Name of stage to return.</param>
/// <returns>The vertices in the stage, or null if the stage is not found (e.g., it is a table).</returns>
public List<ExecutedVertexInstance> GetStageVertices(string stagename)
{
if (this.jobStages.ContainsKey(stagename))
return this.jobStages[stagename];
return null;
}
/// <summary>
/// The list of all stage names.
/// </summary>
/// <returns>An iterator over the stage names.</returns>
public IEnumerable<string> GetAllStageNames()
{
return this.jobStages.Keys;
}
/// <summary>
/// Number of vertices in job.
/// </summary>
public int Count
{
get { return this.count; }
}
/// <summary>
/// Add a new vertex to this job.
/// </summary>
/// <param name="vi">Vertex description to add.</param>
/// <returns>Stage name that the vertex belongs to.</returns>
public void Add(ExecutedVertexInstance vi)
{
int id = vi.Number;
List<ExecutedVertexInstance> l;
if (vertices.ContainsKey(id))
l = vertices[id];
else
{
l = new List<ExecutedVertexInstance>();
vertices.Add(id, l);
}
l.Add(vi);
this.count++;
string stage = vi.StageName;
List<ExecutedVertexInstance> members;
if (this.jobStages.ContainsKey(stage))
members = this.jobStages[stage];
else
{
members = new List<ExecutedVertexInstance>();
this.jobStages.Add(stage, members);
}
members.Add(vi);
if (!this.vertexByGuid.ContainsKey(vi.UniqueID))
this.vertexByGuid.Add(vi.UniqueID, vi);
this.jobStages["All vertices"].Add(vi);
}
/// <summary>
/// Find the information associated with a given vertex and version.
/// </summary>
/// <param name="id">Vertex number in job.</param>
/// <param name="version">Vertex version.</param>
/// <returns>Matching VertexInfo or null if no vertex exists.</returns>
public ExecutedVertexInstance FindVertex(int id, int version)
{
if (!this.vertices.ContainsKey(id))
return null;
List<ExecutedVertexInstance> l = this.vertices[id];
foreach (ExecutedVertexInstance i in l)
if (i.Version == version)
return i;
return null;
}
/// <summary>
/// Find a vertex from its guid. Currently very slow.
/// </summary>
/// <param name="guid">Vertex guid.</param>
/// <returns>The vertex with the correct guid, or null.</returns>
public ExecutedVertexInstance FindVertexByGuid(string guid)
{
if (this.vertexByGuid.ContainsKey(guid))
return this.vertexByGuid[guid];
return null;
}
/// <summary>
/// The set of all vertices in the job.
/// </summary>
/// <returns>An enumerator over VertexInfo objects.</returns>
public IEnumerable<ExecutedVertexInstance> AllVertices()
{
foreach (int key in this.vertices.Keys)
{
List<ExecutedVertexInstance> l = this.vertices[key];
foreach (ExecutedVertexInstance vi in l)
yield return vi;
}
}
/// <summary>
/// A vertex has received a new guid.
/// </summary>
/// <param name="vi">Executed vertex instance.</param>
/// <param name="newGuid">New guid.</param>
internal void Remap(ExecutedVertexInstance vi, string newGuid)
{
if (!this.vertexByGuid.ContainsKey(newGuid))
this.vertexByGuid.Add(newGuid, vi);
}
}
/// <summary>
/// Brief description of a channel endpoint.
/// </summary>
public class ChannelEndpointDescription
{
/// <summary>
/// Is the endpoint of this channel an input?
/// </summary>
public bool IsInput { get; protected set; }
/// <summary>
/// The input/output number.
/// </summary>
public int Number { get; protected set; }
/// <summary>
/// Type of URI.
/// </summary>
public string UriType { get; protected set; }
/// <summary>
/// Part of URI without the type to the channel contents.
/// </summary>
public string LocalPath { get; protected set; }
/// <summary>
/// How big is the channel (0 if it cannot be determined, e.g. FIFO, -1 if the channel data cannot be retrieved).
/// </summary>
public long Size { get; set; }
/// <summary>
/// String representation of the endpoint.
/// </summary>
public override string ToString()
{
string uritype = this.UriType;
string localpath = this.LocalPath;
return string.Format("{0,4} {1,20:N0} {2}://{3}", this.Number, this.Size, uritype, localpath);
}
/// <summary>
/// Create a channel endpoint description
/// </summary>
/// <param name="isInput">True if the channel endpoint is an input.</param>
/// <param name="number">The input/output number.</param>
/// <param name="uri">URI to channel contents.</param>
/// <param name="uripathprefix">Relative uris will need this prefix appended.</param>
/// <param name="fast">If true the channel size is not computed (this is much faster).</param>
/// <param name="reporter">Delegate used to report errors.</param>
// ReSharper disable once UnusedParameter.Local
public ChannelEndpointDescription(bool isInput, int number, string uri, string uripathprefix, bool fast, StatusReporter reporter)
{
this.IsInput = isInput;
this.Number = number;
int sepindex = uri.IndexOf("://");
if (sepindex < 0)
throw new CalypsoDryadException("Channel URI " + uri + " does not contain separator ://");
this.UriType = uri.Substring(0, sepindex);
// some HPC URIs use the compression scheme as an "option" (not really defined for file:// uris, but...)
// strip it here
int option = uri.IndexOf('?');
if (option >= 0)
{
uri = uri.Substring(0, option);
}
this.LocalPath = uri.Substring(sepindex + 3);
if (uripathprefix != null) {
// Unfortunately the uri is absolute, although it should be relative sometimes. We fix this here.
this.LocalPath = Path.Combine(uripathprefix, this.LocalPath);
}
if (fast)
this.Size = 0;
else
{
switch (this.UriType)
{
case "file":
if (File.Exists(this.LocalPath))
{
this.Size = new FileInfo(this.LocalPath).Length;
}
else
{
this.Size = -1;
}
break;
default:
this.Size = 0;
break;
}
}
}
/// <summary>
/// Create a channel endpoint description from Scope information.
/// </summary>
/// <param name="isInput">True if the channel endpoint is an input.</param>
/// <param name="number">The input/output number.</param>
/// <param name="uri">URI to channel contents.</param>
/// <param name="size">Size of channel if known.</param>
public ChannelEndpointDescription(bool isInput, int number, string uri, long size)
{
this.IsInput = isInput;
this.Number = number;
int sepindex = uri.IndexOf("://");
if (sepindex < 0)
throw new CalypsoClusterException("Channel URI " + uri + " does not contain separator ://");
this.UriType = uri.Substring(0, sepindex);
this.LocalPath = uri.Substring(sepindex + 3);
this.Size = size;
}
}
/// <summary>
/// An instance of an executed vertex (each vertex may execute multiple times).
/// </summary>
public class ExecutedVertexInstance
{
/// <summary>
/// State that the vertex is in.
/// </summary>
public enum VertexState
{
/// <summary>
/// Scheduled but never executed (duplicate scheduling abandoned).
/// </summary>
Abandoned,
/// <summary>
/// Vertex has been created but not started.
/// </summary>
Created,
/// <summary>
/// Vertex has started running, but has not yet completed.
/// </summary>
Started,
/// <summary>
/// Vertex has been cancelled.
/// </summary>
Cancelled,
/// <summary>
/// Vertex has been cancelled after completing successfully.
/// </summary>
Invalidated,
/// <summary>
/// Vertex has completed successfully.
/// </summary>
Successful,
/// <summary>
/// Vertex has failed.
/// </summary>
Failed,
/// <summary>
/// Vertex has been cancelled by the scheduler.
/// </summary>
Revoked,
/// <summary>
/// Vertex state is not yet known.
/// </summary>
Unknown,
};
/// <summary>
/// State the vertex is in.
/// </summary>
public VertexState State { get; protected set; }
/// <summary>
/// The error message related to this vertex.
/// </summary>
string error;
/// <summary>
/// Directory where the vertex executed.
/// </summary>
public IClusterResidentObject WorkDirectory { get; protected set; }
/// <summary>
/// Amount of data read by vertex (may be unknown, then it's -1).
/// </summary>
public long DataRead { get; internal set; }
/// <summary>
/// Amount of data written by vertex (may be unknown, then it's -1).
/// </summary>
public long DataWritten { get; internal set; }
/// <summary>
/// On some platforms this is a guid, but not always. At least the identifier is unique per job.
/// </summary>
public string UniqueID { get; protected set; }
/// <summary>
/// String representation of the cluster configuration type.
/// </summary>
private readonly string ClusterConfigType;
// <ErrorCode>0x830a0017<!-- Vertex Had Errors --></ErrorCode>
static readonly Regex errorCodeRegex = new Regex(@"Vertex Had Errors, \<ErrorCode\>(.*)\<!--.*--\>\</ErrorCode\>", RegexOptions.Compiled);
// Super__128[0][1] -> stage name is Super__128[0]
static readonly Regex stageNameRegex = new Regex(@"(.*)\[(\d+)\]$", RegexOptions.Compiled);
/// <summary>
/// Dynamic names have really strange names: Dynamic__128[13]+[0]. We want to bundle all these into a single dynamic stage.
/// </summary>
static readonly Regex dynamicStageNameRegex = new Regex(@"(Dynamic__(\d+))\[\d+\](\+\[(\d+)\])*$", RegexOptions.Compiled);
/// <summary>
/// Some scope stages have double indices.
/// </summary>
static readonly Regex scopeStageNameRegex = new Regex(@"([^]]*)(\[\d+\](\[\d+\])*)$", RegexOptions.Compiled);
/// <summary>
/// Create a vertex information.
/// </summary>
/// <param name="job">Information about the current job.</param>
/// <param name="number">Vertex number, unique in job.</param>
/// <param name="version">Vertex version.</param>
/// <param name="name">Name of vertex in graph.</param>
/// <param name="uniqueId">Unique vertex identifier; on some platforms the value is not correct at this point.</param>
/// <param name="timeStamp">Time when vertex was created; maybe MinValue if unknown.</param>
public ExecutedVertexInstance(DryadLinqJobInfo job, int number, int version, string name, string uniqueId, DateTime timeStamp)
{
this.Number = number;
this.Name = name;
this.Version = version;
this.ProcessIdentifier = new DryadProcessIdentifier();
this.IsManager = false;
this.DataRead = -1;
this.DataWritten = -1;
this.State = VertexState.Created;
this.error = "";
this.Machine = "";
this.timingSet = false;
this.UniqueID = uniqueId;
this.ClusterConfigType = job.ClusterConfiguration.GetType().ToString();
this.channelsAreFinal = false;
this.ComputeStageName();
this.CreationTime = timeStamp;
this.Start = this.StartCommandTime = this.VertexScheduleTime = this.End = DateTime.MinValue;
}
/// <summary>
/// The vertex has started.
/// </summary>
/// <param name="machine">Machine on which vertex is run.</param>
/// <param name="job">Job containing the vertex.</param>
/// <param name="approxStartTime">Approximate starting time (the real value is known when the vertex is terminated).</param>
/// <param name="identifier">Id of process running this vertex (several vertices may share a process).</param>
/// <param name="uniqueId">Unique identifier.</param>
public void SetStartInformation(DryadLinqJobInfo job, string machine, DateTime approxStartTime, DryadProcessIdentifier identifier, string uniqueId)
{
this.Machine = machine;
this.Start = approxStartTime;
this.ProcessIdentifier = identifier;
this.WorkDirectory = job.ClusterConfiguration.ProcessWorkDirectory(this.ProcessIdentifier, false, machine, job.Summary);
this.StdoutFile = job.ClusterConfiguration.ProcessStdoutFile(this.ProcessIdentifier, false, machine, job.Summary);
this.SetState(VertexState.Started);
if (approxStartTime == DateTime.MinValue)
throw new CalypsoDryadException("Unexpected small start time for vertex");
this.LogDirectory = job.ClusterConfiguration.ProcessLogDirectory(this.ProcessIdentifier, false, machine, job.Summary);
this.LogFilesPattern = job.ClusterConfiguration.VertexLogFilesPattern(false, job.Summary);
this.UniqueID = uniqueId;
if (this.StdoutFile != null)
this.StdoutFile.ShouldCacheLocally = false; // don't cache until vertex proved terminated
if (this.LogDirectory != null)
this.LogDirectory.ShouldCacheLocally = false;
}
/// <summary>
/// Is this "vertex" a job manager?
/// </summary>
public bool IsManager { get; internal set; }
/// <summary>
/// Path to file containing logged vertex standard output.
/// </summary>
public IClusterResidentObject StdoutFile { get; protected set; }
/// <summary>
/// Path to directory containing the logs generated by this vertex.
/// </summary>
public IClusterResidentObject LogDirectory { get; protected set; }
/// <summary>
/// Pattern matching the log files generated by this vertex in the log directory.
/// </summary>
public string LogFilesPattern { get; protected set; }
/// <summary>
/// Guid of the process which contained this vertex (and maybe other vertices too).
/// </summary>
public DryadProcessIdentifier ProcessIdentifier { get; internal set; }
/// <summary>
/// Vertex version (each vertex may execute multiple times.)
/// </summary>
public int Version { get; protected set; }
/// <summary>
/// Machine where vertex ran.
/// </summary>
public string Machine { get; protected set; }
/// <summary>
/// Name of vertex, such as Select__1[3].
/// </summary>
public string Name { get; protected set; }
/// <summary>
/// Return vertex number.
/// </summary>
public int Number { get; set; }
/// <summary>
/// If this is true the cached channels won't change anymore.
/// </summary>
private bool channelsAreFinal;
/// <summary>
/// The input channels of this vertex; this must be explicitly computed by invoking DiscoverChannels(), since it's expensive.
/// </summary>
public Dictionary<int, ChannelEndpointDescription> InputChannels { get; set; }
/// <summary>
/// The output channels of this vertex; this must be explicitly computed by invoking DiscoverChannels(), since it's expensive.
/// </summary>
public Dictionary<int, ChannelEndpointDescription> OutputChannels { get; set; }
/// <summary>
/// Compute the stage name and partition number.
/// </summary>
private void ComputeStageName()
{
if (this.ClusterConfigType.Contains("Scope"))
{
// Stage names extactred from the Scope algebra look different
Match m = scopeStageNameRegex.Match(this.Name);
if (m.Success)
{
this.stageName = m.Groups[1].Value;
}
else
{
this.stageName = this.Name;
}
}
else
{
Match m = dynamicStageNameRegex.Match(this.Name);
if (m.Success)
{
this.stageName = m.Groups[1].Value;
}
else
{
m = stageNameRegex.Match(this.Name);
if (m.Success)
{
this.stageName = m.Groups[1].Value;
}
else
{
this.stageName = this.Name;
}
}
}
}
private string stageName;
/// <summary>
/// Try to guess the stage name from the vertex name. Not easy for generic Dryad jobs, doable for DryadLINQ jobs.
/// </summary>
public string StageName
{
// cache the result
get {
if (this.stageName == null) {
this.ComputeStageName();
}
return this.stageName;
}
}
#region TIMING_INFORMATION
/// <summary>
/// True if the timing information has been set.
/// </summary>
private bool timingSet;
/// <summary>
/// Time when vertex was created.
/// </summary>
public DateTime CreationTime { get; internal set; }
/// <summary>
/// Time when vertex process is scheduled.
/// </summary>
public DateTime VertexScheduleTime { get; internal set; }
/// <summary>
/// Time when process start command is issued to the PN.
/// </summary>
public DateTime StartCommandTime { get; internal set; }
/// <summary>
/// Time when process is created by PN.
/// </summary>
public DateTime Start { get; protected set; }
/// <summary>
/// Time when vertex has completed
/// </summary>
public DateTime End { get; internal set; }
/// <summary>
/// How long has the vertex been running?
/// </summary>
public TimeSpan RunningTime
{
get
{
if (this.Start != DateTime.MinValue && this.End != DateTime.MinValue)
return this.End - this.Start;
else
return TimeSpan.MinValue;
}
}
/// <summary>
/// Mark the fact that the vertex was still running at the specified time.
/// </summary>
/// <param name="when">Time when vertex was known to be running.</param>
public void MarkVertexWasRunning(DateTime when)
{
if (this.Start == DateTime.MinValue)
{
Trace.TraceInformation("Vertex {0} which is not started is still running?", this.Name);
return;
//throw new CalypsoClusterException("Vertex which is not started is still running?");
}
if (this.Start > when)
// This can happen if the cluster clocks are not synchronized with the local machine clocks.
return;
this.End = when;
}
/// <summary>
/// Set the timing parameters for a vertex execution.
/// Some times can be negative, if the respective states were actually never certainly reached.
/// </summary>
/// <param name="creation">Absolute time of vertex creation.</param>
/// <param name="creatToScheduleTime">Number of seconds from creation to process being ready to run.</param>
/// <param name="schedToStartProcessTime">Number of seconds from ready to run to process dispatch to PN.</param>
/// <param name="startToCreatedProcessTime">Number of seconds from start to process creation on PN.</param>
/// <param name="processToRunTime">Number of seconds from process creation to process running on PN.</param>
/// <param name="runToCompTime">Actual running time of process.</param>
internal DateTime SetTiming(DateTime creation,
string creatToScheduleTime, string schedToStartProcessTime, string startToCreatedProcessTime, string processToRunTime, string runToCompTime)
{
if (this.timingSet)
// a vertex may be cancelled after it's already terminated, e.g, due to a missing output.
// but the second timing information is incorrect
return DateTime.MinValue;
if (this.State == VertexState.Successful || this.State == VertexState.Failed)
// allow the timing to be set again for cancelled vertices
this.timingSet = true;
this.CreationTime = creation;
double ctst = Double.Parse(creatToScheduleTime);
double stsp = Double.Parse(schedToStartProcessTime);
double stcp = Double.Parse(startToCreatedProcessTime);
double ptrt = Double.Parse(processToRunTime);
double rtct = Double.Parse(runToCompTime);
double totSeconds = ctst + stsp + stcp + ptrt + rtct;
TimeSpan total = TimeSpan.FromSeconds(totSeconds);
DateTime totalTime = creation + total;
if (totSeconds < 0)
throw new CalypsoDryadException("Negative total time for vertex " + this.Name);
// if the vertex has no machine just ignore the times
if (string.IsNullOrEmpty(this.Machine))
return totalTime;
if (ctst >= 0)
{
TimeSpan creatToSchedule = TimeSpan.FromSeconds(ctst);
this.VertexScheduleTime = this.CreationTime + creatToSchedule;
if (stsp >= 0)
{
TimeSpan schedToStartProcess = TimeSpan.FromSeconds(stsp);
this.StartCommandTime = this.VertexScheduleTime + schedToStartProcess;
if (stcp >= 0)
{
TimeSpan startToCreatedProcess = TimeSpan.FromSeconds(stcp);
if (ptrt >= 0)
{
TimeSpan processToRun = TimeSpan.FromSeconds(ptrt);
this.Start = this.StartCommandTime + startToCreatedProcess + processToRun;
if (rtct >= 0)
{
TimeSpan runToComp = TimeSpan.FromSeconds(rtct);
this.End = this.Start + runToComp;
}
}
}
}
}
return totalTime;
}
#endregion
/// <summary>
/// Set termination status of a vertex based on the status string.
/// </summary>
/// <param name="status">Status reported by job manager.</param>
/// <returns>Failure status of vertex: false if it failed with error, true if it succeeded or it was a killed duplicate.</returns>
/// <param name="time">Time when vertex terminated. Maybe MinValue if unknown.</param>
internal bool SetTermination(string status, DateTime time)
{
if (status.ToLower().StartsWith("vertex has completed"))
{
this.SetState(VertexState.Successful);
}
else if (status.ToLower().StartsWith("killed"))
{
this.SetState(VertexState.Cancelled);
}
else if (status.ToLower().StartsWith("failed"))
{
this.SetState(VertexState.Failed);
}
else if (status.ToLower().StartsWith("vertex had errors"))
{
Match m = errorCodeRegex.Match(status);
// Vertex Had Errors, <ErrorCode>0x830a0017<!-- Vertex Had Errors --></ErrorCode>
if (m.Success)
this.ExitCode = m.Groups[1].Value;
else
this.ExitCode = status;
this.SetState(VertexState.Failed);
}
else if (status.ToLower().StartsWith("vertex received termination"))
{
this.SetState(VertexState.Cancelled);
}
else
{
this.SetState(VertexState.Failed);
}
if (time != DateTime.MinValue)
this.End = time;
return this.State == VertexState.Failed;
}
/// <summary>
/// Additional information about the vertex error.
/// </summary>
/// <param name="s">Information about error.</param>
internal void AddErrorString(string s)
{
if (this.error == null)
this.error = "";
this.error += "\n" + s;
}
/// <summary>
/// Exit code of vertex.
/// </summary>
public string ExitCode { get; set; }
/// <summary>
/// Return the error string.
/// </summary>
/// <returns>The error string as reported to the JM.</returns>
public string ErrorString
{
get
{
return this.error;
}
set
{
if (string.IsNullOrEmpty(this.error))
this.error = value;
else
this.error += " " + value;
}
}
/// <summary>
/// Parse a part of the 'originalInfo.txt' file to discover a set of channel endpoints.
/// </summary>
/// <param name="sr">Stream reader which contains the channel information.</param>
/// <returns>The list of channels, or null on failure.</returns>
/// <param name="uriprefix">If the channel is an output, prefix the path with this; this is null for inputs.</param>
/// <param name="skip">If true, do not return anything (still useful to advance the stream reader).</param>
/// <param name="fast">If true the channel sizes are not discovered; this is much faster, since no remote machines are queried for files.</param>
/// <param name="updateProgress">Delegate used to report progress.</param>
/// <param name="reporter">Delegate used to report errors.</param>
private Dictionary<int, ChannelEndpointDescription> DiscoverOriginalInfoChannels(ISharedStreamReader sr, string uriprefix, bool skip, bool fast, StatusReporter reporter, Action<int> updateProgress)
{
bool isInput = uriprefix == null;
string countline = sr.ReadLine();
if (countline == null)
return null;
int channelCount;
int spaceIndex = countline.IndexOf(' ');
if (spaceIndex > 0)
countline = countline.Substring(0, spaceIndex);
bool success = int.TryParse(countline, out channelCount);
if (!success)
return null;
var channels = new Dictionary<int, ChannelEndpointDescription>(channelCount);
for (int i = 0; i < channelCount; i++)
{
string channel = sr.ReadLine();
if (channel == null)
{
if (updateProgress != null)
updateProgress(100);
return null;
}
if (!skip)
{
ChannelEndpointDescription desc = new ChannelEndpointDescription(isInput, i, channel, uriprefix, fast, reporter);
channels.Add(i, desc);
if (updateProgress != null)
updateProgress(i * 100 / channelCount);
}
}
if (updateProgress != null)
updateProgress(100);
if (skip)
return null;
return channels;
}
/// <summary>
/// Discover the vertex channels in a vertex-*-rerun file.
/// </summary>
/// <returns>True if the discovery was successful.</returns>
/// <param name="inputs">If true discover the inputs.</param>
/// <param name="outputs">If true discover the outputs.</param>
/// <param name="fast">If true do not discover the channel sizes (much faster).</param>
/// <param name="progress">Delegate used to report progress.</param>
/// <param name="reporter">Delegate used to report errors.</param>
public bool DiscoverOriginalInfoChannels(bool inputs, bool outputs, bool fast, StatusReporter reporter, Action<int> progress)
{
string filename = string.Format("vertex-{0}-{1}-rerun-originalInfo.txt", this.Number, this.Version);
bool success = true;
// The format of this file is fixed.
if (this.InputChannels != null)
// skip discovery
inputs = false;
ISharedStreamReader sr = this.WorkDirectory.GetFile(filename).GetStream();
var channels = this.DiscoverOriginalInfoChannels(sr, null, !inputs, fast, reporter, progress);
if (channels == null)
{
if (inputs)
success = false;
}
else
this.InputChannels = channels;
if (this.OutputChannels != null)
// skip discovery
outputs = false;
channels = this.DiscoverOriginalInfoChannels(sr, this.WorkDirectory.ToString(), !outputs, fast, reporter, progress);
if (channels == null)
{
if (outputs)
success = false;
}
else
this.OutputChannels = channels;
sr.Close();
return success;
}
/// <summary>
/// Discover the vertex channels in a Scope-generated vcmdStart*xml file.
/// </summary>
/// <returns>True if the discovery was successful.</returns>
/// <param name="inputs">If true discover the inputs.</param>
/// <param name="outputs">If true discover the outputs.</param>
/// <param name="fast">If true do not discover the channel sizes (much faster).</param>
/// <param name="progress">Delegate used to report progress.</param>
/// <param name="reporter">Delegate used to report errors.</param>
// ReSharper disable UnusedParameter.Global
public bool DiscoverScopeChannels(bool inputs, bool outputs, bool fast, StatusReporter reporter, Action<int> progress)
// ReSharper restore UnusedParameter.Global
{
// find the xml file
var files = this.WorkDirectory.GetFilesAndFolders("vcmdStart*.xml").ToList();
if (files.Count != 1)
{
reporter("Cannot locate vcmdStart*.xml file", StatusKind.Error);
return false;
}
ISharedStreamReader sr = files.First().GetStream();
if (sr.Exception != null)
{
reporter("Error reading vcmdStart*.xml file" + sr.Exception.Message, StatusKind.Error);
return false;
}
// ReSharper disable PossibleNullReferenceException
XDocument plan = XDocument.Parse(sr.ReadToEnd());
if (inputs && this.InputChannels == null)
{
var channels = new Dictionary<int, ChannelEndpointDescription>();
IEnumerable<XElement> inputsData = plan.Root.Element("inputs").Elements();
int chno = 0;
foreach (var e in inputsData)
{
string chpath = e.Attribute("path").Value;
long size = long.Parse(e.Attribute("length").Value);
ChannelEndpointDescription desc = new ChannelEndpointDescription(true, chno, chpath, size);
channels.Add(chno, desc);
chno++;
}
this.InputChannels = channels;
}
if (outputs && this.OutputChannels == null)
{
var channels = new Dictionary<int, ChannelEndpointDescription>();
IEnumerable<XElement> inputsData = plan.Root.Element("outputs").Elements();
int chno = 0;
foreach (var e in inputsData)
{
string chpath = e.Attribute("path").Value;
ChannelEndpointDescription desc = new ChannelEndpointDescription(true, chno, chpath, -1);
channels.Add(chno, desc);
chno ++;
}
this.OutputChannels = channels;
}
// ReSharper restore PossibleNullReferenceException
sr.Close();
return true;
}
/// <summary>
/// Discover the input and output channels of the vertex. Populates the 'InputChannels' and 'OutputChannel' lists.
/// </summary>
/// <returns>True if the discovery was successful.</returns>
/// <param name="inputs">If true discover the inputs.</param>
/// <param name="outputs">If true discover the outputs.</param>
/// <param name="fast">If true do not discover the channel sizes (much faster).</param>
/// <param name="progress">Delegate used to report progress.</param>
/// <param name="reporter">Delegate used to report errors.</param>
public bool DiscoverChannels(bool inputs, bool outputs, bool fast, StatusReporter reporter, Action<int> progress)
{
// check if the result is already cached
if ((this.InputChannels != null || !inputs) &&
(this.OutputChannels != null || !outputs))
return true;
if (this.WorkDirectory == null || this.WorkDirectory.Exception != null)
return false;
// The format of this file is fixed.
if (!this.channelsAreFinal)
{
// invalidate cache
this.InputChannels = null;
this.OutputChannels = null;
}
bool result;
IClusterResidentObject wd = this.WorkDirectory;
if (wd is FolderInCachedCluster)
{
wd = (wd as FolderInCachedCluster).OriginalFolder;
}
if (wd is UNCFile)
{
result = this.DiscoverOriginalInfoChannels(inputs, outputs, fast, reporter, progress);
}
else
{
result = false;
}
if (this.VertexIsCompleted)
this.channelsAreFinal = true;
return result;
}
/// <summary>
/// If true the vertex is no longer running; some of its information can be cached.
/// </summary>
public bool VertexIsCompleted
{
get
{
switch (this.State)
{
case VertexState.Cancelled:
case VertexState.Abandoned:
case VertexState.Failed:
case VertexState.Successful:
case VertexState.Invalidated:
case VertexState.Revoked:
return true;
default:
return false;
}
}
}
/// <summary>
/// String representation of the executed vertex instance.
/// </summary>
/// <returns>A string briefly describing the executed vertex instance.</returns>
public override string ToString()
{
return string.Format("Vertex {0}.{1} ({2}) status {3}", this.Number, this.Version, this.Name, this.State);
}
/// <summary>
/// Replace the information in an executed vertex instance when a new vertex is created.
/// This can only happen in some cases when cancelled vertex numbers are reused.
/// </summary>
/// <param name="name">New vertex name.</param>
/// <param name="guid">New vertex guid.</param>
// ReSharper disable once UnusedParameter.Global
internal void Update(string name, string guid)
{
if (this.State != VertexState.Cancelled && this.State != VertexState.Abandoned)
throw new CalypsoDryadException("Updating a non-cancelled/abandoned vertex");
if (this.Name != name)
throw new CalypsoDryadException("Vertex changed name");
this.UniqueID = guid;
this.SetState(VertexState.Created);
// the stdoutfile is expected to change, so I don't invalidate the cache
}
/// <summary>
/// Set the vertex state.
/// </summary>
/// <param name="state">New vertex state.</param>
public void SetState(VertexState state)
{
this.State = state;
bool cache = this.VertexIsCompleted;
if (this.StdoutFile != null)
this.StdoutFile.ShouldCacheLocally = cache;
if (this.LogDirectory != null)
this.LogDirectory.ShouldCacheLocally = cache;
if (this.WorkDirectory != null)
this.WorkDirectory.ShouldCacheLocally = cache;
}
/// <summary>
/// A CSV header matching the AsCSV data.
/// </summary>
/// <returns>A string describing the CSV header for a vertex executed instance.</returns>
public static string CSV_Header()
{
return "Name,Stage,Start,End,Running time,State,Data Read,Data Written,Version,Machine,Process ID";
}
/// <summary>
/// CSV representation of the information about an executed vertex instance.
/// </summary>
/// <returns>The information in CSV format, matching the CSV_Header.</returns>
public string AsCSV()
{
string start = this.Start != DateTime.MinValue ? this.Start.ToString("s") : "";
string end = this.End != DateTime.MinValue ? this.End.ToString("s") : "";
string running = this.RunningTime > TimeSpan.Zero ? this.RunningTime.ToString() : "";
return string.Format("{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10}",
this.Name, this.StageName, start, end, running, this.State, this.DataRead, this.DataWritten, this.Version, this.Machine, this.ProcessIdentifier);
}
}
}