Commit 5f2653e

Add spark versions of walker classes (ReadWalkerSpark, AssemblyRegionWalkerSpark, IntervalWalkerSpark, VariantWalkerSpark) and examples.
1 parent 37ca5bb · commit 5f2653e

16 files changed: +960 −15 lines

src/main/java/org/broadinstitute/hellbender/engine/ReadsContext.java

+12 −1
@@ -20,6 +20,7 @@
 public final class ReadsContext implements Iterable<GATKRead> {

     private final ReadsDataSource dataSource;
+    private final Iterable<GATKRead> iterable;

     private final SimpleInterval interval;

@@ -41,16 +42,23 @@ public ReadsContext() {
      */
     public ReadsContext( final ReadsDataSource dataSource, final SimpleInterval interval ) {
         this.dataSource = dataSource;
+        this.iterable = null;
         this.interval = interval;
     }

+    public ReadsContext( Shard<GATKRead> shard ) {
+        this.dataSource = null;
+        this.iterable = shard;
+        this.interval = shard.getInterval();
+    }
+
     /**
      * Does this context have a backing source of reads data?
      *
      * @return true if there is a backing ReadsDataSource, otherwise false
      */
     public boolean hasBackingDataSource() {
-        return dataSource != null;
+        return dataSource != null || iterable != null;
     }

     /**
@@ -71,6 +79,9 @@ public SimpleInterval getInterval() {
      */
     @Override
     public Iterator<GATKRead> iterator() {
+        if (iterable != null && interval != null) {
+            return iterable.iterator();
+        }
         // We can't perform a query if we lack either a dataSource or an interval to query on
         if ( dataSource == null || interval == null ) {
             return Collections.<GATKRead>emptyList().iterator();
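
Usage note: the new Shard-backed constructor lets executor-side code iterate a shard's reads without any ReadsDataSource. A minimal sketch, assuming a Shard<GATKRead> is already in hand (countReadsInShard is a hypothetical helper, not part of this commit):

    // Hypothetical helper: count the reads overlapping a shard by iterating
    // a ReadsContext built from the shard itself (no ReadsDataSource needed).
    static long countReadsInShard(final Shard<GATKRead> shard) {
        final ReadsContext readsContext = new ReadsContext(shard);
        long count = 0;
        for (final GATKRead read : readsContext) {
            count++;
        }
        return count;
    }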
src/main/java/org/broadinstitute/hellbender/engine/filters/VariantFilterLibrary.java

+3 −1
@@ -1,8 +1,10 @@
 package org.broadinstitute.hellbender.engine.filters;

+import java.io.Serializable;
+
 /**
  * Collects common variant filters.
  */
 public final class VariantFilterLibrary {
-    public static VariantFilter ALLOW_ALL_VARIANTS = variant -> true;
+    public static VariantFilter ALLOW_ALL_VARIANTS = (VariantFilter & Serializable) variant -> true;
 }
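
Usage note: the (VariantFilter & Serializable) intersection cast makes the compiler generate a lambda class that also implements Serializable, which is what allows Spark to ship the filter inside task closures. A self-contained sketch of the pattern, using java.util.function.Predicate as a stand-in for VariantFilter:

    import java.io.Serializable;
    import java.util.function.Predicate;

    public class SerializableLambdaDemo {
        public static void main(String[] args) {
            // A plain lambda is not Serializable and would fail Java
            // serialization when Spark serializes the closure...
            Predicate<String> plain = s -> true;
            // ...but an intersection cast yields a serializable lambda.
            Predicate<String> spark = (Predicate<String> & Serializable) s -> true;
            System.out.println(plain instanceof Serializable); // false
            System.out.println(spark instanceof Serializable); // true
        }
    }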
src/main/java/org/broadinstitute/hellbender/engine/spark/AssemblyRegionWalkerSpark.java

+189
@@ -0,0 +1,189 @@
+package org.broadinstitute.hellbender.engine.spark;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Iterables;
+import htsjdk.samtools.SAMFileHeader;
+import htsjdk.samtools.SAMSequenceDictionary;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.FlatMapFunction;
+import org.apache.spark.broadcast.Broadcast;
+import org.broadinstitute.hellbender.cmdline.Advanced;
+import org.broadinstitute.hellbender.cmdline.Argument;
+import org.broadinstitute.hellbender.engine.*;
+import org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource;
+import org.broadinstitute.hellbender.engine.filters.ReadFilter;
+import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
+import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter;
+import org.broadinstitute.hellbender.utils.IntervalUtils;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+import scala.Tuple3;
+
+import javax.annotation.Nullable;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Collectors;
+
+/**
+ * A Spark version of {@link AssemblyRegionWalker}.
+ */
+public abstract class AssemblyRegionWalkerSpark extends GATKSparkTool {
+    private static final long serialVersionUID = 1L;
+
+    @Argument(fullName="readShardSize", shortName="readShardSize", doc = "Maximum size of each read shard, in bases. For good performance, this should be much larger than the maximum assembly region size.", optional = true)
+    protected int readShardSize = defaultReadShardSize();
+
+    @Argument(fullName="readShardPadding", shortName="readShardPadding", doc = "Each read shard has this many bases of extra context on each side. Read shards must have as much or more padding than assembly regions.", optional = true)
+    protected int readShardPadding = defaultReadShardPadding();
+
+    @Argument(fullName = "minAssemblyRegionSize", shortName = "minAssemblyRegionSize", doc = "Minimum size of an assembly region", optional = true)
+    protected int minAssemblyRegionSize = defaultMinAssemblyRegionSize();
+
+    @Argument(fullName = "maxAssemblyRegionSize", shortName = "maxAssemblyRegionSize", doc = "Maximum size of an assembly region", optional = true)
+    protected int maxAssemblyRegionSize = defaultMaxAssemblyRegionSize();
+
+    @Argument(fullName = "assemblyRegionPadding", shortName = "assemblyRegionPadding", doc = "Number of additional bases of context to include around each assembly region", optional = true)
+    protected int assemblyRegionPadding = defaultAssemblyRegionPadding();
+
+    @Argument(fullName = "maxReadsPerAlignmentStart", shortName = "maxReadsPerAlignmentStart", doc = "Maximum number of reads to retain per alignment start position. Reads above this threshold will be downsampled. Set to 0 to disable.", optional = true)
+    protected int maxReadsPerAlignmentStart = defaultMaxReadsPerAlignmentStart();
+
+    @Advanced
+    @Argument(fullName = "activeProbabilityThreshold", shortName = "activeProbabilityThreshold", doc="Minimum probability for a locus to be considered active.", optional = true)
+    protected double activeProbThreshold = defaultActiveProbThreshold();
+
+    @Advanced
+    @Argument(fullName = "maxProbPropagationDistance", shortName = "maxProbPropagationDistance", doc="Upper limit on how many bases away probability mass can be moved around when calculating the boundaries between active and inactive assembly regions", optional = true)
+    protected int maxProbPropagationDistance = defaultMaxProbPropagationDistance();
+
+    /**
+     * @return Default value for the {@link #readShardSize} parameter, if none is provided on the command line
+     */
+    protected abstract int defaultReadShardSize();
+
+    /**
+     * @return Default value for the {@link #readShardPadding} parameter, if none is provided on the command line
+     */
+    protected abstract int defaultReadShardPadding();
+
+    /**
+     * @return Default value for the {@link #minAssemblyRegionSize} parameter, if none is provided on the command line
+     */
+    protected abstract int defaultMinAssemblyRegionSize();
+
+    /**
+     * @return Default value for the {@link #maxAssemblyRegionSize} parameter, if none is provided on the command line
+     */
+    protected abstract int defaultMaxAssemblyRegionSize();
+
+    /**
+     * @return Default value for the {@link #assemblyRegionPadding} parameter, if none is provided on the command line
+     */
+    protected abstract int defaultAssemblyRegionPadding();
+
+    /**
+     * @return Default value for the {@link #maxReadsPerAlignmentStart} parameter, if none is provided on the command line
+     */
+    protected abstract int defaultMaxReadsPerAlignmentStart();
+
+    /**
+     * @return Default value for the {@link #activeProbThreshold} parameter, if none is provided on the command line
+     */
+    protected abstract double defaultActiveProbThreshold();
+
+    /**
+     * @return Default value for the {@link #maxProbPropagationDistance} parameter, if none is provided on the command line
+     */
+    protected abstract int defaultMaxProbPropagationDistance();
+
+    @Argument(doc = "whether to use the shuffle implementation or not", shortName = "shuffle", fullName = "shuffle", optional = true)
+    public boolean shuffle = false;
+
+    @Override
+    public final boolean requiresReads() { return true; }
+
+    @Override
+    public final boolean requiresReference() { return true; }
+
+    public List<ReadFilter> getDefaultReadFilters() {
+        final List<ReadFilter> defaultFilters = new ArrayList<>(2);
+        defaultFilters.add(new WellformedReadFilter());
+        defaultFilters.add(new ReadFilterLibrary.MappedReadFilter());
+        return defaultFilters;
+    }
+
+    /**
+     * @return The evaluator to be used to determine whether each locus is active or not. Must be implemented by tool authors.
+     *         The results of this per-locus evaluator are used to determine the bounds of each active and inactive region.
+     */
+    public abstract AssemblyRegionEvaluator assemblyRegionEvaluator();
+
+    private List<ShardBoundary> intervalShards;
+
+    @Override
+    protected List<SimpleInterval> editIntervals(List<SimpleInterval> rawIntervals) {
+        SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
+        List<SimpleInterval> intervals = rawIntervals == null ? IntervalUtils.getAllIntervalsForReference(sequenceDictionary) : rawIntervals;
+        intervalShards = intervals.stream()
+                .flatMap(interval -> Shard.divideIntervalIntoShards(interval, readShardSize, readShardPadding, sequenceDictionary).stream())
+                .collect(Collectors.toList());
+        List<SimpleInterval> paddedIntervalsForReads =
+                intervals.stream().map(interval -> interval.expandWithinContig(readShardPadding, sequenceDictionary)).collect(Collectors.toList());
+        return paddedIntervalsForReads;
+    }
+
+    /**
+     * Loads assembly regions and the corresponding reference and features into a {@link JavaRDD} for the intervals specified.
+     *
+     * If no intervals were specified, returns all the assembly regions.
+     *
+     * @return all assembly regions as a {@link JavaRDD}, bounded by intervals if specified.
+     */
+    public JavaRDD<Tuple3<AssemblyRegion, ReferenceContext, FeatureContext>> getAssemblyRegions(JavaSparkContext ctx) {
+        SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
+        JavaRDD<Shard<GATKRead>> shardedReads = SparkSharder.shard(ctx, getReads(), GATKRead.class, sequenceDictionary, intervalShards, readShardSize, shuffle);
+        Broadcast<ReferenceMultiSource> bReferenceSource = hasReference() ? ctx.broadcast(getReference()) : null;
+        Broadcast<FeatureManager> bFeatureManager = features == null ? null : ctx.broadcast(features);
+        return shardedReads.flatMap(getAssemblyRegionsFunction(bReferenceSource, bFeatureManager, sequenceDictionary, getHeaderForReads(),
+                assemblyRegionEvaluator(), minAssemblyRegionSize, maxAssemblyRegionSize, assemblyRegionPadding, activeProbThreshold, maxProbPropagationDistance));
+    }
+
+    private static FlatMapFunction<Shard<GATKRead>, Tuple3<AssemblyRegion, ReferenceContext, FeatureContext>> getAssemblyRegionsFunction(
+            final Broadcast<ReferenceMultiSource> bReferenceSource,
+            final Broadcast<FeatureManager> bFeatureManager,
+            final SAMSequenceDictionary sequenceDictionary,
+            final SAMFileHeader header,
+            final AssemblyRegionEvaluator evaluator,
+            final int minAssemblyRegionSize,
+            final int maxAssemblyRegionSize,
+            final int assemblyRegionPadding,
+            final double activeProbThreshold,
+            final int maxProbPropagationDistance) {
+        return (FlatMapFunction<Shard<GATKRead>, Tuple3<AssemblyRegion, ReferenceContext, FeatureContext>>) shardedRead -> {
+            SimpleInterval paddedInterval = shardedRead.getPaddedInterval();
+            SimpleInterval assemblyRegionPaddedInterval = paddedInterval.expandWithinContig(assemblyRegionPadding, sequenceDictionary);
+
+            ReferenceDataSource reference = bReferenceSource == null ? null :
+                    new ReferenceMemorySource(bReferenceSource.getValue().getReferenceBases(null, assemblyRegionPaddedInterval), sequenceDictionary);
+            FeatureManager features = bFeatureManager == null ? null : bFeatureManager.getValue();
+            ReferenceContext referenceContext = new ReferenceContext(reference, paddedInterval);
+            FeatureContext featureContext = new FeatureContext(features, paddedInterval);
+
+            final Iterable<AssemblyRegion> assemblyRegions = AssemblyRegion.createFromReadShard(shardedRead,
+                    header, referenceContext, featureContext, evaluator,
+                    minAssemblyRegionSize, maxAssemblyRegionSize, assemblyRegionPadding, activeProbThreshold,
+                    maxProbPropagationDistance);
+            return Iterables.transform(assemblyRegions, new Function<AssemblyRegion, Tuple3<AssemblyRegion, ReferenceContext, FeatureContext>>() {
+                @Nullable
+                @Override
+                public Tuple3<AssemblyRegion, ReferenceContext, FeatureContext> apply(@Nullable AssemblyRegion assemblyRegion) {
+                    return new Tuple3<>(assemblyRegion,
+                            new ReferenceContext(reference, assemblyRegion.getExtendedSpan()),
+                            new FeatureContext(features, assemblyRegion.getExtendedSpan()));
+                }
+            });
+        };
+    }
+
+}
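
For orientation, a concrete tool would subclass AssemblyRegionWalkerSpark roughly as follows. This is a hedged sketch: the class name, default values, and runTool body are illustrative, not part of this commit.

    // Hypothetical subclass showing the contract imposed by AssemblyRegionWalkerSpark.
    public final class ExampleAssemblyRegionWalkerSpark extends AssemblyRegionWalkerSpark {
        private static final long serialVersionUID = 1L;

        @Override protected int defaultReadShardSize() { return 5000; }
        @Override protected int defaultReadShardPadding() { return 100; }
        @Override protected int defaultMinAssemblyRegionSize() { return 50; }
        @Override protected int defaultMaxAssemblyRegionSize() { return 300; }
        @Override protected int defaultAssemblyRegionPadding() { return 100; }
        @Override protected int defaultMaxReadsPerAlignmentStart() { return 0; }
        @Override protected double defaultActiveProbThreshold() { return 0.002; }
        @Override protected int defaultMaxProbPropagationDistance() { return 50; }

        @Override
        public AssemblyRegionEvaluator assemblyRegionEvaluator() {
            // A real tool returns its per-locus activity evaluator here
            // (e.g. the one HaplotypeCaller uses); omitted in this sketch.
            throw new UnsupportedOperationException("sketch only");
        }

        @Override
        protected void runTool(final JavaSparkContext ctx) {
            // Materialize the regions; a real tool would map over the
            // (AssemblyRegion, ReferenceContext, FeatureContext) tuples.
            System.out.println("assembly regions: " + getAssemblyRegions(ctx).count());
        }
    }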

src/main/java/org/broadinstitute/hellbender/engine/spark/GATKSparkTool.java

+19
@@ -2,6 +2,8 @@

 import org.broadinstitute.hellbender.cmdline.GATKPlugin.GATKCommandLinePluginDescriptor;
 import org.broadinstitute.hellbender.cmdline.GATKPlugin.GATKReadFilterPluginDescriptor;
+import org.broadinstitute.hellbender.engine.FeatureDataSource;
+import org.broadinstitute.hellbender.engine.FeatureManager;
 import org.broadinstitute.hellbender.utils.SerializableFunction;
 import com.google.cloud.genomics.dataflow.utils.GCSOptions;
 import htsjdk.samtools.SAMFileHeader;
@@ -92,6 +94,7 @@ public abstract class GATKSparkTool extends SparkCommandLineProgram {
     private ReferenceMultiSource referenceSource;
     private SAMSequenceDictionary referenceDictionary;
     private List<SimpleInterval> intervals;
+    protected FeatureManager features;

     /**
      * Return the list of GATKCommandLinePluginDescriptor objects to be used for this CLP.
@@ -354,6 +357,7 @@ protected void runPipeline( JavaSparkContext sparkContext ) {
     private void initializeToolInputs(final JavaSparkContext sparkContext) {
         initializeReference();
         initializeReads(sparkContext); // reference must be initialized before reads
+        initializeFeatures();
         initializeIntervals();
     }

@@ -393,6 +397,21 @@ private void initializeReference() {
         }
     }

+    /**
+     * Initialize our source of Feature data (or set it to null if no Feature argument(s) were provided).
+     *
+     * Package-private so that engine classes can access it, but concrete tool child classes cannot.
+     * May be overridden by traversals that require custom initialization of Feature data sources.
+     *
+     * By default, this method initializes the FeatureManager to use the lookahead cache of {@link FeatureDataSource#DEFAULT_QUERY_LOOKAHEAD_BASES} bases.
+     */
+    void initializeFeatures() {
+        features = new FeatureManager(this);
+        if ( features.isEmpty() ) {  // No available sources of Features discovered for this tool
+            features = null;
+        }
+    }
+
     /**
      * Loads our intervals using the best available sequence dictionary (as returned by {@link #getBestAvailableSequenceDictionary})
      * to parse/verify them. Does nothing if no intervals were specified.
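
Usage note: with the FeatureManager now initialized (and broadcast by the Spark walkers), executor-side tool code reads Features through a FeatureContext. A hedged sketch, where the knownVariants FeatureInput is a hypothetical tool argument:

    // Hypothetical: list the variants overlapping the current context's interval.
    static void printOverlappingVariants(final FeatureContext featureContext,
                                         final FeatureInput<VariantContext> knownVariants) {
        for (final VariantContext vc : featureContext.getValues(knownVariants)) {
            System.out.println(vc.getContig() + ":" + vc.getStart());
        }
    }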
src/main/java/org/broadinstitute/hellbender/engine/spark/IntervalWalkerSpark.java

+78
@@ -0,0 +1,78 @@
+package org.broadinstitute.hellbender.engine.spark;
+
+import htsjdk.samtools.SAMSequenceDictionary;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.broadcast.Broadcast;
+import org.broadinstitute.hellbender.cmdline.Argument;
+import org.broadinstitute.hellbender.engine.*;
+import org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+import scala.Tuple4;
+
+import java.util.List;
+import java.util.stream.Collectors;
+
+/**
+ * A Spark version of {@link IntervalWalker}.
+ */
+public abstract class IntervalWalkerSpark extends GATKSparkTool {
+    private static final long serialVersionUID = 1L;
+
+    @Override
+    public boolean requiresIntervals() {
+        return true;
+    }
+
+    @Argument(doc = "whether to use the shuffle implementation or not", shortName = "shuffle", fullName = "shuffle", optional = true)
+    public boolean shuffle = false;
+
+    @Argument(fullName="intervalShardPadding", shortName="intervalShardPadding", doc = "Each interval shard has this many bases of extra context on each side.", optional = true)
+    public int intervalShardPadding = 1000;
+
+    /**
+     * Customize initialization of the Feature data source for this traversal type to disable query lookahead.
+     */
+    void initializeFeatures() {
+        // Disable query lookahead in our FeatureManager for this traversal type. Query lookahead helps
+        // when our query intervals are overlapping and gradually increasing in position (as they are
+        // with ReadWalkers, typically), but with IntervalWalkers our query intervals are guaranteed
+        // to be non-overlapping, since our interval parsing code always merges overlapping intervals.
+        features = new FeatureManager(this, 0);
+        if ( features.isEmpty() ) {  // No available sources of Features for this tool
+            features = null;
+        }
+    }
+
+    /**
+     * Loads intervals and the corresponding reads, reference and features into a {@link JavaRDD}.
+     *
+     * @return all intervals as a {@link JavaRDD}.
+     */
+    public JavaRDD<Tuple4<SimpleInterval, ReadsContext, ReferenceContext, FeatureContext>> getIntervals(JavaSparkContext ctx) {
+        SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
+        // don't shard the intervals themselves, since we want each interval to be processed by a single task
+        final List<ShardBoundary> intervalShardBoundaries = getIntervals().stream()
+                .map(i -> new ShardBoundary(i, i)).collect(Collectors.toList());
+        JavaRDD<Shard<GATKRead>> shardedReads = SparkSharder.shard(ctx, getReads(), GATKRead.class, sequenceDictionary, intervalShardBoundaries, Integer.MAX_VALUE, shuffle);
+        Broadcast<ReferenceMultiSource> bReferenceSource = hasReference() ? ctx.broadcast(getReference()) : null;
+        Broadcast<FeatureManager> bFeatureManager = features == null ? null : ctx.broadcast(features);
+        return shardedReads.map(getIntervalsFunction(bReferenceSource, bFeatureManager, sequenceDictionary, intervalShardPadding));
+    }
+
+    private static org.apache.spark.api.java.function.Function<Shard<GATKRead>, Tuple4<SimpleInterval, ReadsContext, ReferenceContext, FeatureContext>> getIntervalsFunction(
+            Broadcast<ReferenceMultiSource> bReferenceSource, Broadcast<FeatureManager> bFeatureManager,
+            SAMSequenceDictionary sequenceDictionary, int intervalShardPadding) {
+        return (org.apache.spark.api.java.function.Function<Shard<GATKRead>, Tuple4<SimpleInterval, ReadsContext, ReferenceContext, FeatureContext>>) shard -> {
+            // get reference bases for this shard (padded)
+            SimpleInterval interval = shard.getInterval();
+            SimpleInterval paddedInterval = shard.getInterval().expandWithinContig(intervalShardPadding, sequenceDictionary);
+            ReadsContext readsContext = new ReadsContext(shard);
+            ReferenceDataSource reference = bReferenceSource == null ? null :
+                    new ReferenceMemorySource(bReferenceSource.getValue().getReferenceBases(null, paddedInterval), sequenceDictionary);
+            FeatureManager features = bFeatureManager == null ? null : bFeatureManager.getValue();
+            return new Tuple4<>(interval, readsContext, new ReferenceContext(reference, interval), new FeatureContext(features, interval));
+        };
+    }
+}
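
For orientation, a concrete tool would subclass IntervalWalkerSpark along these lines. This is a hedged sketch: the class name and output path are hypothetical, not part of this commit.

    // Hypothetical subclass: report each traversed interval with its read count.
    public final class ExampleIntervalWalkerSpark extends IntervalWalkerSpark {
        private static final long serialVersionUID = 1L;

        @Override
        protected void runTool(final JavaSparkContext ctx) {
            getIntervals(ctx)
                    .map(t -> t._1() + " reads=" + com.google.common.collect.Iterables.size(t._2()))
                    .saveAsTextFile("example-intervals"); // hypothetical output location
        }
    }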
