fix memory bug, minor other changes
This commit is contained in:
parent
5ad103cba7
commit
86242db082
|
|
@ -77,7 +77,7 @@
|
|||
<SpecificVersion>False</SpecificVersion>
|
||||
<HintPath>..\packages\Microsoft.Research.Peloponnese.0.7.2-beta\lib\net45\Microsoft.Research.Peloponnese.HadoopBridge.dll</HintPath>
|
||||
</Reference>
|
||||
<Reference Include="Microsoft.Research.Peloponnese.Utils, Version=0.7.0.0, Culture=neutral, processorArchitecture=AMD64">
|
||||
<Reference Include="Microsoft.Research.Peloponnese.Utils, Version=0.7.2.0, Culture=neutral, processorArchitecture=AMD64">
|
||||
<SpecificVersion>False</SpecificVersion>
|
||||
<HintPath>..\packages\Microsoft.Research.Peloponnese.0.7.2-beta\lib\net45\Microsoft.Research.Peloponnese.Utils.dll</HintPath>
|
||||
</Reference>
|
||||
|
|
|
|||
|
|
@ -115,10 +115,12 @@
|
|||
<Reference Include="Microsoft.Research.Peloponnese.HadoopBridge, Version=0.0.0.0, Culture=neutral, processorArchitecture=AMD64">
|
||||
<SpecificVersion>False</SpecificVersion>
|
||||
<HintPath>..\packages\Microsoft.Research.Peloponnese.0.7.2-beta\lib\net45\Microsoft.Research.Peloponnese.HadoopBridge.dll</HintPath>
|
||||
<Private>True</Private>
|
||||
</Reference>
|
||||
<Reference Include="Microsoft.Research.Peloponnese.Utils, Version=0.7.2.0, Culture=neutral, processorArchitecture=AMD64">
|
||||
<SpecificVersion>False</SpecificVersion>
|
||||
<HintPath>..\packages\Microsoft.Research.Peloponnese.0.7.2-beta\lib\net45\Microsoft.Research.Peloponnese.Utils.dll</HintPath>
|
||||
<Private>True</Private>
|
||||
</Reference>
|
||||
<Reference Include="Microsoft.Threading.Tasks">
|
||||
<HintPath>..\packages\Microsoft.Bcl.Async.1.0.166\lib\net40\Microsoft.Threading.Tasks.dll</HintPath>
|
||||
|
|
|
|||
|
|
@ -284,7 +284,7 @@ namespace Microsoft.Research.DryadLinq
|
|||
{
|
||||
if (Context.PeloponneseHomeDirectory == null)
|
||||
{
|
||||
throw new ApplicationException("No Peloponnese home directory is set");
|
||||
Context.PeloponneseHomeDirectory = Context.DryadHomeDirectory;
|
||||
}
|
||||
if (Context.DryadHomeDirectory == null)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -267,7 +267,7 @@ namespace Microsoft.Research.DryadLinq
|
|||
{
|
||||
if (Context.PeloponneseHomeDirectory == null)
|
||||
{
|
||||
throw new ApplicationException("No Peloponnese home directory is set");
|
||||
Context.PeloponneseHomeDirectory = Context.DryadHomeDirectory;
|
||||
}
|
||||
if (Context.DryadHomeDirectory == null)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -94,6 +94,14 @@ namespace Microsoft.Research.Dryad.LocalScheduler
|
|||
/// </summary>
|
||||
private TaskCompletionSource<Process> finishWaiter;
|
||||
|
||||
/// <summary>
|
||||
/// children that will be cancelled when finishWaiter is unblocked. These are materialized into a
|
||||
/// separate set so that they can be discarded when they are no longer needed. Otherwise, when
|
||||
/// used in WhenAny internally, they lead to the GC holding onto any other tasks in the WhenAny
|
||||
/// clause until finishWaiter completes, which is a huge memory leak
|
||||
/// </summary>
|
||||
private HashSet<TaskCompletionSource<Process>> childFinishWaiters;
|
||||
|
||||
/// <summary>
|
||||
/// this blocks until the command loop exits
|
||||
/// </summary>
|
||||
|
|
@ -140,6 +148,8 @@ namespace Microsoft.Research.Dryad.LocalScheduler
|
|||
// make the Task that CommandLoop blocks on; when finishWaiter is started it returns null
|
||||
// causing CommandLoop to exit.
|
||||
finishWaiter = new TaskCompletionSource<Process>();
|
||||
childFinishWaiters = new HashSet<TaskCompletionSource<Process>>();
|
||||
finishWaiter.Task.ContinueWith((t) => Task.Run(() => SetChildFinishWaiters()));
|
||||
|
||||
// this is started when the Command Loop exits
|
||||
exited = new TaskCompletionSource<bool>();
|
||||
|
|
@ -201,9 +211,54 @@ namespace Microsoft.Research.Dryad.LocalScheduler
|
|||
}
|
||||
|
||||
/// <summary>
|
||||
/// a task that can be awaited and will asynchronously unblock when the finishWaiter result is set
|
||||
/// set all the pending cancellations from the master finishWaiter
|
||||
/// </summary>
|
||||
private Task<Process> AsyncFinishWaiter { get { return finishWaiter.Task.ContinueWith((t) => t.Result); } }
|
||||
private void SetChildFinishWaiters()
{
    // Detach the pending set while holding the lock, but complete the waiters only after
    // the lock has been released and the field has been cleared. SetResult may run a
    // waiter's continuations synchronously on this thread; a continuation that goes on to
    // call RemoveAsyncFinishWaiter would re-enter the (reentrant) lock and remove from the
    // set while it is still being enumerated, invalidating the enumerator.
    HashSet<TaskCompletionSource<Process>> pending;
    lock (this)
    {
        pending = childFinishWaiters;

        // a null set tells GetAsyncFinishWaiter to complete new waiters immediately and
        // makes RemoveAsyncFinishWaiter a no-op
        childFinishWaiters = null;
    }

    if (pending == null)
    {
        // the waiters have already been released
        return;
    }

    foreach (TaskCompletionSource<Process> waiter in pending)
    {
        // propagate the master finishWaiter's result to every child still waiting
        waiter.SetResult(finishWaiter.Task.Result);
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// get a task that can be awaited and will asynchronously unblock when the finishWaiter result is set
|
||||
/// </summary>
|
||||
private TaskCompletionSource<Process> GetAsyncFinishWaiter()
{
    // each caller gets its own completion source rather than sharing finishWaiter's task
    var childWaiter = new TaskCompletionSource<Process>();

    lock (this)
    {
        HashSet<TaskCompletionSource<Process>> pending = childFinishWaiters;
        if (pending != null)
        {
            // still waiting: register the child so SetChildFinishWaiters can complete it
            pending.Add(childWaiter);
        }
        else
        {
            // the pending set has already been released, so hand back a waiter that
            // carries finishWaiter's result straight away
            childWaiter.SetResult(finishWaiter.Task.Result);
        }
    }

    return childWaiter;
}
|
||||
|
||||
/// <summary>
|
||||
/// take the finish waiter out of the list of pending waiters, since its target has completed
|
||||
/// </summary>
|
||||
/// <param name="waiter">waiter to remove</param>
|
||||
private void RemoveAsyncFinishWaiter(TaskCompletionSource<Process> waiter)
{
    lock (this)
    {
        HashSet<TaskCompletionSource<Process>> pending = childFinishWaiters;
        if (pending == null)
        {
            // the set has already been released; there is nothing left to unregister
            return;
        }

        // drop the reference so the set does not keep this waiter alive
        pending.Remove(waiter);
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// (asynchronously) block until there is a process available on the local queue, the rack queue
|
||||
|
|
@ -248,7 +303,9 @@ namespace Microsoft.Research.Dryad.LocalScheduler
|
|||
|
||||
// we want to wait either for waiter to be matched with a Process in one of the three queues, or
|
||||
// for ShutDown to be called, so make an array of tasks and wait for the first one to be unblocked.
|
||||
var unblocked = await Task.WhenAny(blocker, AsyncFinishWaiter);
|
||||
TaskCompletionSource<Process> thisWaiter = GetAsyncFinishWaiter();
|
||||
var unblocked = await Task.WhenAny(blocker, thisWaiter.Task);
|
||||
RemoveAsyncFinishWaiter(thisWaiter);
|
||||
|
||||
if (unblocked.Result != null)
|
||||
{
|
||||
|
|
@ -341,7 +398,9 @@ namespace Microsoft.Research.Dryad.LocalScheduler
|
|||
{
|
||||
logger.Log("Computer " + name + " reporting match with process " + process.Id);
|
||||
|
||||
await process.OnScheduled(this, nextTask, AsyncFinishWaiter, null);
|
||||
TaskCompletionSource<Process> thisWaiter = GetAsyncFinishWaiter();
|
||||
await process.OnScheduled(this, nextTask, thisWaiter.Task, null);
|
||||
RemoveAsyncFinishWaiter(thisWaiter);
|
||||
|
||||
logger.Log("Computer " + name + " waiting for process " + process.Id + " to complete");
|
||||
|
||||
|
|
|
|||
|
|
@ -137,7 +137,7 @@ namespace Microsoft.Research.Dryad.LocalScheduler
|
|||
public void DecrementQueueCount()
|
||||
{
|
||||
Debug.Assert(queueCount > 0);
|
||||
++queueCount;
|
||||
--queueCount;
|
||||
if (owner == null && !scheduling && queueCount == 0)
|
||||
{
|
||||
// the queue count has dropped to zero without the process being matched
|
||||
|
|
|
|||
|
|
@ -131,6 +131,11 @@ namespace Microsoft.Research.Dryad.LocalScheduler
|
|||
{
|
||||
processQueue = new Queue<Process>();
|
||||
waiterQueue = new Queue<ProcessWaiter>();
|
||||
|
||||
// start background cleaning tasks
|
||||
CleanProcessQueue();
|
||||
CleanWaiterQueue();
|
||||
|
||||
active = true;
|
||||
}
|
||||
|
||||
|
|
@ -164,6 +169,76 @@ namespace Microsoft.Research.Dryad.LocalScheduler
|
|||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// background thread to periodically remove any claimed processes, so we don't
|
||||
/// hang on to memory indefinitely
|
||||
/// </summary>
|
||||
private async void CleanProcessQueue()
{
    // loops until processQueue is observed to be null
    for (;;)
    {
        lock (this)
        {
            Queue<Process> current = processQueue;
            if (current == null)
            {
                // we have shut down, so exit this daemon
                return;
            }

            // rebuild the queue, preserving order, keeping only unclaimed entries
            Queue<Process> survivors = new Queue<Process>();
            foreach (Process candidate in current)
            {
                // Unclaimed is read while holding the process's own lock
                lock (candidate)
                {
                    if (candidate.Unclaimed)
                    {
                        survivors.Enqueue(candidate);
                    }
                }
            }

            processQueue = survivors;
        }

        // pause for a second (outside the lock) before the next sweep
        await Task.Delay(1000);
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// background thread to periodically remove any claimed waiters, so we don't
|
||||
/// hang on to memory indefinitely
|
||||
/// </summary>
|
||||
private async void CleanWaiterQueue()
{
    while (true)
    {
        lock (this)
        {
            // Exit when either queue has been cleared. The original guard tested only
            // processQueue even though this loop enumerates waiterQueue; checking
            // waiterQueue as well keeps the sweep from dereferencing a null queue.
            // NOTE(review): assumes a null queue means shutdown, as the original
            // comment states for processQueue -- confirm against the shutdown path.
            if (processQueue == null || waiterQueue == null)
            {
                // we have shut down, so exit this daemon
                return;
            }

            // rebuild the queue, preserving order, keeping only unclaimed waiters
            Queue<ProcessWaiter> cleanedQueue = new Queue<ProcessWaiter>();
            foreach (ProcessWaiter w in waiterQueue)
            {
                // Unclaimed is read while holding the waiter's own lock
                lock (w)
                {
                    if (w.Unclaimed)
                    {
                        cleanedQueue.Enqueue(w);
                    }
                }
            }

            waiterQueue = cleanedQueue;
        }

        // clean again in a second
        await Task.Delay(1000);
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// add a schedulable process. If there is an unclaimed computer waiting, the
|
||||
/// process will be assigned to the computer and the computer's Task will be
|
||||
|
|
@ -305,7 +380,7 @@ namespace Microsoft.Research.Dryad.LocalScheduler
|
|||
}
|
||||
}
|
||||
|
||||
// even if there are processes, they may have been claimed by ther computers
|
||||
// even if there are processes, they may have been claimed by other computers
|
||||
// already, so use a loop here
|
||||
while (active && processQueue.Count > 0 && !claimed)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -127,6 +127,11 @@
|
|||
a completed application. We have tried to report errors in user application code back so that they are visible in the
|
||||
<link xlink:href="91822db3-8a00-4307-ad8a-595c94f449b0">DryadLINQ Job Browser</link> to avoid the need to consult
|
||||
the logs.</para>
|
||||
<para>
|
||||
<mediaLinkInline>
|
||||
<image xlink:href="Dryad on Azure Architecture"/>
|
||||
</mediaLinkInline>
|
||||
</para>
|
||||
</content>
|
||||
</conclusion>
|
||||
</procedure>
|
||||
|
|
|
|||
Loading…
Reference in New Issue