Dryad/GraphManager/reporting/DrArtemisLegacyReporting.cpp

360 lines
14 KiB
C++

/*
Copyright (c) Microsoft Corporation
All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
compliance with the License. You may obtain a copy of the License
at http://www.apache.org/licenses/LICENSE-2.0
THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER
EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED WARRANTIES OR CONDITIONS OF
TITLE, FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABLITY OR NON-INFRINGEMENT.
See the Apache Version 2.0 License for specific language governing permissions and
limitations under the License.
*/
#include <DrStageHeaders.h>
#include <DrArtemisLegacyReporting.h>
//
// Prints a timestamp as MM/DD/YYYY HH:MM:SS.MS
//
static void PrintTimestamp()
{
SYSTEMTIME utc, local;
FILETIME ft;
GetSystemTimeAsFileTime(&ft);
FileTimeToSystemTime(&ft, &utc);
SystemTimeToTzSpecificLocalTime(NULL, &utc, &local);
printf("[%02d/%02d/%04d %02d:%02d:%02d.%03u] ",
local.wMonth,
local.wDay,
local.wYear,
local.wHour,
local.wMinute,
local.wSecond,
local.wMilliseconds);
}
//
// Prints a timestamp as MM/DD/YYYY HH:MM:SS.MS
//
static FILETIME ConvertToFileTime(DrDateTime timestamp)
{
union {
FILETIME ft;
DrDateTime ts;
};
ts = timestamp;
return ft;
}
/*
// Abandoning duplicate scheduling of vertex 83.1 (InputTable__26[5])
vertexAbandonedRegex = new Regex(@"Abandoning duplicate scheduling of vertex (\d+)\.(\d+) \((.+)\)", RegexOptions.Compiled);
// BasicAggregate__10(1),(Super__3[0].0*Super__3[1].0*Super__3[2].0*Super__3[3].0*Super__3[4].0*Super__3[5].0*Super__3[6].0),(sherwood-068),0.1718728s
topologyRegex = new Regex(@"(.*),\((.*)\),\((\S+)\),([.0-9]+)s$",
RegexOptions.Compiled);
// total=951722563162 local=37817665237 intrapod=189765117248 crosspod=724139780677
datareadRegex = new Regex(@"total=(\d+) local=(\d+) intrapod=(\d+) crosspod=(\d+)", RegexOptions.Compiled);
Also:
Timing Information Graph Start Time (\d+)
JM Finish Time: (\d+)
ABORTING:
\<ErrorString\>(.*)\</ErrorString\>
Application completed successfully.
Application failed with error code (.*)
Total running time in vertices successful/failed: ([.0-9]+)s\/([.0-9]+)s
Average job parallelism
*/
/*
// Created process execution record for vertex 33 (Super__0[0]) v.0 GUID {B0FC788F-1FFC-4D74-AFC4-3EDFF03AF11A}
vertexCreatedRegex = new Regex(@"\[(.*)\] Created process execution record for vertex (\d+) \((.*)\) v.(\d+) GUID \{?([-A-F0-9]+)\}?",
RegexOptions.Compiled);
// Process started for vertex 5 (Super__0[1]) v.0 GUID {73EA55E0-0326-43C4-AD61-CB0B8CF8FE49} machine sherwood-025
// Process started for vertices 23 (Merge__29) 24 (Apply__33) v.0 GUID {E945DC5D-9AF6-4732-8770-2A6BF7FA3041} machine sherwood-237
vertexStartRegex = new Regex(@"\[(.*)\] Process started for vert(\w+) (.*) v\.(.*) GUID \{?([-A-F0-9]+)\}? machine (\S+)",
RegexOptions.Compiled);
*/
void DrArtemisLegacyReporter::ReceiveMessage(DrProcessInfoRef info)
{
DrString processName;
{
DrLockBoxKey<DrProcess> process(info->m_process);
processName = process->GetName();
}
if (info->m_state->m_state == DPS_Starting)
{
PrintTimestamp();
printf("Created process execution record for %s GUID {%s}\n",
processName.GetChars(), info->m_state->m_process->GetHandleIdAsString().GetChars());
PrintTimestamp();
printf("Process started for %s GUID {%s} machine %s\n",
processName.GetChars(), info->m_state->m_process->GetHandleIdAsString().GetChars(),
info->m_state->m_process->GetAssignedNode()->GetLocality().GetChars());
fflush(stdout);
}
}
/*
// Vertex 5.0 (Super__0[1]) machine sherwood-025 guid {73EA55E0-0326-43C4-AD61-CB0B8CF8FE49} status Vertex Has Completed,
terminationRegex = new Regex(@"Vertex (\d+)\.(\d+) \((.+)\) machine (\S+) guid \{?([-0-9A-F]+)\}? status (.*)",
RegexOptions.Compiled);
// Canceling vertex 1461.0 (Merge__13[258]) due to dependent failure
cancelRegex = new Regex(@"\[(.*)\] Canceling vertex (\d+)\.(\d+) \((.+)\) due to (.*)", RegexOptions.Compiled);
// Process was terminated Vertex 11.0 (Select__6[1]) GUID {C1E35A88-F5AD-4A26-BE5F-46B6D515623F} machine sherwood-118 status The operation succeeded
terminatedRegex = new Regex(@"\[(.*)\] Process was terminated Vertex (\d+)\.(\d+) \((.+)\) GUID \{?([-A-F0-9]+)\}? machine (\S+) status (.*)",
RegexOptions.Compiled);
// Process has failed Vertex 11.0 (Select__6[1]) GUID {C1E35A88-F5AD-4A26-BE5F-46B6D515623F} machine sherwood-118 status The operation succeeded
failedRegex = new Regex(@"\[(.*)\] Process has failed Vertex (\d+)\.(\d+) \((.+)\) GUID \{?([-A-F0-9]+)\}? machine (\S+) Exitcode (.*)",
RegexOptions.Compiled);
// Timing Information 5 1 Super__0[1] 128654556602334453 0.0000 0.0000 0.0000 0.0000 0.2969
timingInfoRegex = new Regex(@"Timing Information (\d+) (\d+) (.+) (\d+) ([-.0-9]+) ([-.0-9]+) ([-.0-9]+) ([-.0-9]+) ([-.0-9]+)",
RegexOptions.Compiled);
*/
void DrArtemisLegacyReporter::ReceiveMessage(DrVertexInfoRef info)
{
DrString processGuid = "(no guid)";
DrString machineName = "(no computer)";
DrDateTime processSchedule = DrDateTime_Never;
DrDateTime processStartCreate = DrDateTime_Never;
DrDateTime processFinishCreate = DrDateTime_Never;
if (info->m_process.IsEmpty() == false)
{
DrLockBoxKey<DrProcess> process(info->m_process);
DrProcessHandlePtr handle = process->GetInfo()->m_state->m_process;
if (handle != DrNull)
{
processGuid = handle->GetHandleIdAsString();
if (handle->GetAssignedNode() != DrNull)
{
machineName = handle->GetAssignedNode()->GetLocality();
}
}
processSchedule = process->GetInfo()->m_jmProcessScheduledTime;
processStartCreate = process->GetInfo()->m_state->m_createdTime;
processFinishCreate = process->GetInfo()->m_state->m_beginExecutionTime;
}
if (info->m_state == DVS_Completed)
{
PrintTimestamp();
printf("Vertex %d.%d (%s) machine %s guid {%s} status Vertex Has Completed\n",
info->m_info->GetProcessStatus()->GetVertexId(),
info->m_info->GetProcessStatus()->GetVertexInstanceVersion(),
info->m_name.GetChars(), machineName.GetChars(), processGuid.GetChars());
}
else if (info->m_state == DVS_Failed)
{
if (info->m_statistics->m_exitStatus == DrError_CohortShutdown)
{
PrintTimestamp();
printf("Canceling vertex %d.%d (%s) due to dependent failure\n",
info->m_info->GetProcessStatus()->GetVertexId(),
info->m_info->GetProcessStatus()->GetVertexInstanceVersion(),
info->m_name.GetChars());
}
if (info->m_statistics->m_exitStatus == DrError_Unexpected)
{
PrintTimestamp();
printf("Process was terminated Vertex %d.%d (%s) GUID {%s} machine %s status The operation succeeded\n",
info->m_info->GetProcessStatus()->GetVertexId(),
info->m_info->GetProcessStatus()->GetVertexInstanceVersion(),
info->m_name.GetChars(), processGuid.GetChars(), machineName.GetChars());
}
else
{
PrintTimestamp();
printf("Process has failed Vertex %d.%d (%s) GUID {%s} machine %s Exitcode %x\n",
info->m_info->GetProcessStatus()->GetVertexId(),
info->m_info->GetProcessStatus()->GetVertexInstanceVersion(),
info->m_name.GetChars(), processGuid.GetChars(), machineName.GetChars(),
info->m_statistics->m_exitStatus);
}
}
if (info->m_state == DVS_Completed || info->m_state == DVS_Failed)
{
if ((info->m_statistics != DrNull)
&& (info->m_statistics->m_totalInputData != DrNull)
&& (info->m_statistics->m_totalOutputData != DrNull))
{
printf("Io information %d %d %s read %I64u wrote %I64u\n",
info->m_info->GetProcessStatus()->GetVertexId(),
info->m_info->GetProcessStatus()->GetVertexInstanceVersion(),
info->m_name.GetChars(),
info->m_statistics->m_totalInputData->m_dataRead,
info->m_statistics->m_totalOutputData->m_dataWritten);
printf("Io locality information %d %d %s read %I64u ( %I64u local )\n",
info->m_info->GetProcessStatus()->GetVertexId(),
info->m_info->GetProcessStatus()->GetVertexInstanceVersion(),
info->m_name.GetChars(),
info->m_statistics->m_totalInputData->m_dataRead,
info->m_statistics->m_totalLocalInputData);
}
}
if (info->m_state >= DVS_Completed)
{
DrVertexExecutionStatisticsPtr eStats = info->m_statistics;
DrDateTime execRunning = eStats->m_runningTime;
DrDateTime completion = eStats->m_completionTime;
if ((processSchedule == DrDateTime_Never) || (processSchedule < eStats->m_creationTime))
{
/* this vertex is part of a cohort, and was created after the
process had already started */
processSchedule = eStats->m_creationTime;
}
if ((processStartCreate == DrDateTime_Never) || (processStartCreate < processSchedule))
{
/* this vertex is part of a cohort, and was created after the
process had already started, or clock skew means it appears to have started before it
was scheduled */
processStartCreate = processSchedule;
}
if ((processFinishCreate == DrDateTime_Never) || (processFinishCreate < processStartCreate))
{
/* this vertex is part of a cohort, and was created after the
process creation had already completed */
processFinishCreate = processStartCreate;
}
if ((execRunning == DrDateTime_Never) || (execRunning < processFinishCreate))
{
/* this vertex had never run */
execRunning = processFinishCreate;
}
if ((completion == DrDateTime_Never) || (completion < execRunning))
{
/* clock skew??? */
completion = execRunning;
}
double creatToScheduleTime =
(double) (processSchedule - eStats->m_creationTime) / (double) DrTimeInterval_Second;
double schedToStartProcessTime =
(double) (processStartCreate - processSchedule) / (double) DrTimeInterval_Second;
double pStartToCreatedProcessTime =
(double) (processFinishCreate - processStartCreate) / (double) DrTimeInterval_Second;
double cProcessToRunTime =
(double) (execRunning - processFinishCreate) / (double) DrTimeInterval_Second;
double runToCompTime =
(double) (completion - execRunning) / (double) DrTimeInterval_Second;
// No need to print timestamp for timing report
printf("Timing Information %u %u %s %I64u %.4f %.4f %.4f %.4f %.4f\n",
info->m_info->GetProcessStatus()->GetVertexId(),
info->m_info->GetProcessStatus()->GetVertexInstanceVersion(),
info->m_name.GetChars(),
eStats->m_creationTime,
creatToScheduleTime, schedToStartProcessTime, pStartToCreatedProcessTime,
cProcessToRunTime, runToCompTime);
fflush(stdout);
}
}
void DrArtemisLegacyReporter::ReportFinalTopology(DrVertexPtr vertex, DrResourcePtr runningMachine,
DrTimeInterval runningTime)
{
DrString machineName = "nowhere";
if (runningMachine != DrNull)
{
machineName = runningMachine->GetName();
}
// No need to print timestamp for topology reporting
printf("%s(%d),(", vertex->GetName().GetChars(), vertex->GetOutputs()->GetNumberOfEdges());
int i;
for (i=0; i<vertex->GetInputs()->GetNumberOfEdges(); ++i)
{
DrEdge e = vertex->GetInputs()->GetEdge(i);
if (e.m_remoteVertex == DrNull)
{
printf("%sNULL", (i > 0) ? "*" : "");
}
else
{
printf("%s%s.%d", (i > 0) ? "*" : "", e.m_remoteVertex->GetName().GetChars(), e.m_remotePort);
}
}
printf("),(%s),%lfs\n", machineName.GetChars(), (double) runningTime / (double) DrTimeInterval_Second);
fflush(stdout);
}
void DrArtemisLegacyReporter::ReportStart(DrDateTime startTime)
{
FILETIME ft = ConvertToFileTime(startTime);
SYSTEMTIME utc, local;
FileTimeToSystemTime(&ft, &utc);
SystemTimeToTzSpecificLocalTime(NULL, &utc, &local);
printf("Start time: %02d/%02d/%04d %02d:%02d:%02d.%03u\n",
local.wMonth,
local.wDay,
local.wYear,
local.wHour,
local.wMinute,
local.wSecond,
local.wMilliseconds);
fflush(stdout);
}
void DrArtemisLegacyReporter::ReportStop(UINT exitCode, DrNativeString /*errorString*/, DrDateTime /*stopTime*/)
{
SYSTEMTIME utc, local;
FILETIME ft;
GetSystemTimeAsFileTime(&ft);
FileTimeToSystemTime(&ft, &utc);
SystemTimeToTzSpecificLocalTime(NULL, &utc, &local);
printf("Stop time (Exit code = %u): %02d/%02d/%04d %02d:%02d:%02d.%03u\n",
exitCode,
local.wMonth,
local.wDay,
local.wYear,
local.wHour,
local.wMinute,
local.wSecond,
local.wMilliseconds
);
fflush(stdout);
}