/*
Copyright (c) Microsoft Corporation

All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
compliance with the License. You may obtain a copy of the License
at http://www.apache.org/licenses/LICENSE-2.0

THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER
EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED WARRANTIES OR CONDITIONS OF
TITLE, FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABLITY OR NON-INFRINGEMENT.

See the Apache Version 2.0 License for specific language governing permissions and
limitations under the License.
*/

using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Globalization;
using System.Reflection;
using System.Linq.Expressions;
using System.Linq;
using System.Diagnostics;
using Microsoft.Research.DryadLinq;

#pragma warning disable 1591

namespace Microsoft.Research.DryadLinq.Internal
{
    /// <summary>
    /// Two-phase key sampling used to pick range-partition separator keys.
    /// Phase 1 samples keys from each input; phase 2 reservoir-samples the
    /// phase-1 output, sorts it, and emits the separators.
    /// </summary>
    public static class DryadLinqSampler
    {
        // Probability with which phase 1 selects each individual record.
        internal const double SAMPLE_RATE = 0.001;

        // Capacity of the phase-2 reservoir.
        private const int MAX_SECOND_PHASE_SAMPLES = 1024*1024;

        // Phase 1 switches from "remember everything" to pure streaming once
        // it has collected this many random samples.
        private const int MIN_FIRST_PHASE_SAMPLES = 10;

        /// <summary>
        /// First sampling phase: lazily emits a random subset of the keys of
        /// <paramref name="source"/>, each record being selected with probability
        /// <see cref="SAMPLE_RATE"/>. If the source is exhausted before 10 records
        /// have been selected, the keys of all records are emitted instead.
        /// </summary>
        /// <typeparam name="T">Record type of the source.</typeparam>
        /// <typeparam name="K">Key type produced by <paramref name="keySelector"/>.</typeparam>
        /// <param name="source">Records to sample from; enumerated exactly once.</param>
        /// <param name="keySelector">Extracts the key from a record.</param>
        /// <param name="denv">Vertex environment; its native handle supplies the vertex id used as the random seed.</param>
        /// <returns>The sampled keys (or all keys when sampling selected fewer than 10).</returns>
        [Resource(IsStateful=false)]
        public static IEnumerable<K> Phase1Sampling<T, K>(IEnumerable<T> source,
                                                          Func<T, K> keySelector,
                                                          VertexEnv denv)
        {
            // note: vertexID is constant for each repetition of a specific vertex (eg in fail-and-retry scenarios)
            //       this is very good as it ensure the sampling is idempotent w.r.t. retries.
            long vertexID = DryadLinqNative.GetVertexId(denv.NativeHandle);
            int seed = unchecked((int)(vertexID));
            long nEmitted = 0;
            Random rdm = new Random(seed);

            List<K> allSoFar = new List<K>();
            List<K> samples = new List<K>();

            // The enumerator is driven by hand because enumeration is split across
            // two loops; 'using' guarantees it is disposed even when the consumer
            // abandons this iterator early or an exception escapes.
            using (IEnumerator<T> sourceEnumerator = source.GetEnumerator())
            {
                // try to collect 10 samples, but keep all the records just in case
                while (sourceEnumerator.MoveNext())
                {
                    T elem = sourceEnumerator.Current;
                    K key = keySelector(elem);
                    allSoFar.Add(key);
                    if (rdm.NextDouble() < SAMPLE_RATE)
                    {
                        samples.Add(key);
                        if (samples.Count >= MIN_FIRST_PHASE_SAMPLES)
                        {
                            break;
                        }
                    }
                }

                if (samples.Count >= MIN_FIRST_PHASE_SAMPLES)
                {
                    // we have lots of samples.. emit them and continue sampling
                    allSoFar = null; // not needed.
                    foreach (K key in samples)
                    {
                        yield return key;
                        nEmitted++;
                    }

                    // From here on the key is only computed for records that are actually sampled.
                    while (sourceEnumerator.MoveNext())
                    {
                        T elem = sourceEnumerator.Current;
                        if (rdm.NextDouble() < SAMPLE_RATE)
                        {
                            yield return keySelector(elem);
                            nEmitted++;
                        }
                    }
                }
                else
                {
                    // sampling didn't produce much, so emit all the records instead.
                    DryadLinqLog.AddInfo("Sampling produced only {0} records. Emitting all records instead.", samples.Count);

                    // Evaluated in DEBUG builds only: the first loop can only have ended
                    // without reaching the sample quota if the source ran dry.
                    Debug.Assert(sourceEnumerator.MoveNext() == false, "The source enumerator wasn't finished");

                    samples = null; // the samples list is not needed.
                    foreach (K key in allSoFar)
                    {
                        yield return key;
                        nEmitted++;
                    }
                }
            }

            DryadLinqLog.AddInfo("Stage1 sampling: num keys emitted = {0}", nEmitted);
        }

        //------------------------------------
        //Range-sampler
        // 1. Secondary sampling
        // 2. sort, and select separator values.

        //This method is only used for dynamic inputs. Not required in RTM
        //public static IEnumerable RangeSampler_Dynamic(IEnumerable source,
        //                                               Func keySelector,
        //                                               IComparer comparer,
        //                                               bool isDescending,
        //                                               VertexEnv denv)
        //{
        //    if (denv.NumberOfArguments < 2)
        //    {
        //        throw new DryadLinqException(SR.Sampler_NotEnoughArgumentsForVertex);
        //    }
        //    Int32 pcount = Int32.Parse(denv.GetArgument(denv.NumberOfArguments-1));
        //    return RangeSamplerCore(source, keySelector, comparer, isDescending, pcount);
        //}

        /// <summary>
        /// Range sampler entry point for a static plan (ie pcount is determined on
        /// client-side and baked into vertex code). Forwards to
        /// <see cref="RangeSamplerCore{K}"/>.
        /// </summary>
        /// <typeparam name="K">Key type.</typeparam>
        /// <param name="firstPhaseSamples">Keys produced by the first sampling phase.</param>
        /// <param name="comparer">Ordering used to sort the sampled keys.</param>
        /// <param name="isDescending">True to emit the separators from largest to smallest.</param>
        /// <param name="pcount">Number of partitions; pcount - 1 separators are emitted.</param>
        /// <returns>The separator keys.</returns>
        public static IEnumerable<K> RangeSampler_Static<K>(IEnumerable<K> firstPhaseSamples,
                                                            IComparer<K> comparer,
                                                            bool isDescending,
                                                            int pcount)
        {
            return RangeSamplerCore(firstPhaseSamples, comparer, isDescending, pcount);
        }

        /// <summary>
        /// Second sampling phase: reservoir-samples at most
        /// <see cref="MAX_SECOND_PHASE_SAMPLES"/> keys from
        /// <paramref name="firstPhaseSamples"/>, sorts them with
        /// <paramref name="comparer"/>, and lazily emits pcount - 1 separator keys
        /// (or none at all when the input is empty).
        /// </summary>
        /// <remarks>
        /// When fewer keys than <paramref name="pcount"/> were retained, every key
        /// is emitted and the final one is repeated until pcount - 1 keys have been
        /// produced. With a non-empty input and <paramref name="pcount"/> equal to
        /// zero the interval computation divides by zero.
        /// </remarks>
        /// <typeparam name="K">Key type.</typeparam>
        /// <param name="firstPhaseSamples">Keys produced by the first sampling phase; enumerated once, completely.</param>
        /// <param name="comparer">Ordering used to sort the sampled keys.</param>
        /// <param name="isDescending">True to emit the separators from largest to smallest.</param>
        /// <param name="pcount">Number of partitions.</param>
        /// <returns>The separator keys.</returns>
        public static IEnumerable<K> RangeSamplerCore<K>(IEnumerable<K> firstPhaseSamples,
                                                         IComparer<K> comparer,
                                                         bool isDescending,
                                                         int pcount)
        {
            //Reservoir sampling to produce at most MAX_SECOND_PHASE_SAMPLES records.
            K[] samples = new K[MAX_SECOND_PHASE_SAMPLES];
            int inputCount = 0;
            int reservoirCount = 0;

            // fixed-seed is ok here as second-phase-sampler is a singleton vertex. Idempotency is important.
            Random r = new Random(314159);

            foreach (K key in firstPhaseSamples) // this completely enumerates each source in turn.
            {
                inputCount++; // number of keys seen so far, including this one
                if (reservoirCount < MAX_SECOND_PHASE_SAMPLES)
                {
                    samples[reservoirCount] = key;
                    reservoirCount++;
                }
                else
                {
                    // Draw from 0..inputCount-1 inclusive, where inputCount already counts
                    // this key, so the key is kept with probability capacity/inputCount.
                    // Drawing before counting the key would make the first overflow key
                    // always evict an existing sample.
                    int idx = r.Next(inputCount);
                    if (idx < MAX_SECOND_PHASE_SAMPLES)
                    {
                        samples[idx] = key;
                    }
                }
            }

            // Sort and Emit the keys
            Array.Sort(samples, 0, reservoirCount, comparer);

            DryadLinqLog.AddVerbose("Range-partition separator keys: ");
            DryadLinqLog.AddVerbose("samples: {0}", reservoirCount);
            DryadLinqLog.AddVerbose("pCount:  {0}", pcount);

            if (reservoirCount == 0)
            {
                DryadLinqLog.AddVerbose("  case: cnt==0.  No separators produced.");
                yield break;
            }

            if (reservoirCount < pcount)
            {
                // Too few samples for distinct separators: emit every sample, then
                // repeat the last emitted key until pcount - 1 keys have been produced.
                if (isDescending)
                {
                    for (int i = reservoirCount - 1; i >= 0; i--)
                    {
                        yield return samples[i];
                    }
                    K first = samples[0];
                    for (int i = reservoirCount; i < pcount - 1; i++)
                    {
                        yield return first;
                    }
                }
                else
                {
                    for (int i = 0; i < reservoirCount; i++)
                    {
                        yield return samples[i];
                    }
                    K last = samples[reservoirCount - 1];
                    for (int i = reservoirCount; i < pcount - 1; i++)
                    {
                        yield return last;
                    }
                }
            }
            else
            {
                // Enough samples: take every intv-th key of the sorted reservoir.
                int intv = reservoirCount / pcount;
                if (isDescending)
                {
                    int idx = reservoirCount - intv;
                    for (int i = 0; i < pcount-1; i++)
                    {
                        yield return samples[idx];
                        idx -= intv;
                    }
                }
                else
                {
                    int idx = intv;
                    for (int i = 0; i < pcount-1; i++)
                    {
                        yield return samples[idx];
                        idx += intv;
                    }
                }
            }
        }
    }
}