spark git commit: [MINOR][DOCS] Remove consecutive duplicated words/typo in Spark Repo

srowen Wed, 04 Jan 2017 07:08:45 -0800

Repository: spark
Updated Branches:
  refs/heads/master 7a8250581 -> a1e40b1f5



[MINOR][DOCS] Remove consecutive duplicated words/typo in Spark Repo

## What changes were proposed in this pull request?
There are many locations in the Spark repo where the same word occurs 
consecutively. Sometimes they are appropriately placed, but many times they are 
not. This PR removes the inappropriately duplicated words.

## How was this patch tested?
N/A since only docs or comments were updated.

Author: Niranjan Padmanabhan <[email protected]>

Closes #16455 from neurons/np.structure_streaming_doc.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a1e40b1f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a1e40b1f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a1e40b1f

Branch: refs/heads/master
Commit: a1e40b1f5d651305bbd0ba05779263a44f607498
Parents: 7a82505
Author: Niranjan Padmanabhan <[email protected]>
Authored: Wed Jan 4 15:07:29 2017 +0000
Committer: Sean Owen <[email protected]>
Committed: Wed Jan 4 15:07:29 2017 +0000

----------------------------------------------------------------------
 .../spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java  | 2 +-
 .../spark/util/collection/unsafe/sort/UnsafeSortDataFormat.java  | 2 +-
 core/src/main/scala/org/apache/spark/MapOutputTracker.scala      | 2 +-
 core/src/main/scala/org/apache/spark/TaskEndReason.scala         | 2 +-
 .../main/scala/org/apache/spark/deploy/FaultToleranceTest.scala  | 2 +-
 .../scala/org/apache/spark/deploy/history/HistoryServer.scala    | 2 +-
 .../scala/org/apache/spark/executor/ShuffleReadMetrics.scala     | 2 +-
 .../apache/spark/network/netty/NettyBlockTransferService.scala   | 2 +-
 core/src/main/scala/org/apache/spark/util/SizeEstimator.scala    | 2 +-
 .../org/apache/spark/deploy/history/ApplicationCacheSuite.scala  | 2 +-
 .../scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala     | 4 ++--
 .../spark/util/collection/ExternalAppendOnlyMapSuite.scala       | 2 +-
 docs/ml-features.md                                              | 2 +-
 docs/mllib-statistics.md                                         | 2 +-
 docs/structured-streaming-kafka-integration.md                   | 2 +-
 docs/structured-streaming-programming-guide.md                   | 2 +-
 .../main/java/org/apache/spark/examples/ml/JavaLDAExample.java   | 2 +-
 examples/src/main/python/ml/lda_example.py                       | 2 +-
 .../src/main/scala/org/apache/spark/examples/ml/LDAExample.scala | 2 +-
 .../scala/org/apache/spark/streaming/flume/sink/SparkSink.scala  | 2 +-
 .../org/apache/spark/sql/kafka010/KafkaSourceProvider.scala      | 2 +-
 .../spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala     | 2 +-
 mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala    | 2 +-
 .../org/apache/spark/mllib/api/python/Word2VecModelWrapper.scala | 2 +-
 .../main/scala/org/apache/spark/mllib/clustering/LDAModel.scala  | 2 +-
 .../apache/spark/mllib/regression/StreamingLinearAlgorithm.scala | 2 +-
 python/pyspark/ml/clustering.py                                  | 2 +-
 python/pyspark/ml/linalg/__init__.py                             | 4 ++--
 python/pyspark/sql/utils.py                                      | 2 +-
 .../yarn/LocalityPreferredContainerPlacementStrategy.scala       | 2 +-
 .../org/apache/spark/sql/catalyst/expressions/UnsafeRow.java     | 2 +-
 .../spark/sql/catalyst/expressions/windowExpressions.scala       | 2 +-
 .../spark/sql/catalyst/plans/logical/basicLogicalOperators.scala | 2 +-
 .../scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala | 4 ++--
 .../spark/sql/catalyst/expressions/AttributeSetSuite.scala       | 2 +-
 sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala       | 2 +-
 .../spark/sql/execution/datasources/PartitioningUtils.scala      | 2 +-
 .../scala/org/apache/spark/sql/streaming/DataStreamWriter.scala  | 2 +-
 .../scala/org/apache/spark/sql/DataFrameTimeWindowingSuite.scala | 4 ++--
 .../test/scala/org/apache/spark/sql/execution/PlannerSuite.scala | 2 +-
 .../org/apache/spark/sql/streaming/FileStreamSinkSuite.scala     | 2 +-
 .../src/main/java/org/apache/hive/service/AbstractService.java   | 2 +-
 .../src/main/java/org/apache/hive/service/ServiceOperations.java | 2 +-
 .../java/org/apache/hive/service/ServiceStateChangeListener.java | 2 +-
 .../main/java/org/apache/hive/service/cli/TypeDescriptor.java    | 2 +-
 .../apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala | 2 +-
 .../org/apache/spark/sql/hive/execution/HiveTableScanExec.scala  | 2 +-
 streaming/src/main/scala/org/apache/spark/streaming/State.scala  | 2 +-
 .../scala/org/apache/spark/streaming/dstream/InputDStream.scala  | 2 +-
 .../org/apache/spark/streaming/rdd/MapWithStateRDDSuite.scala    | 2 +-
 .../spark/streaming/rdd/WriteAheadLogBackedBlockRDDSuite.scala   | 2 +-
 .../apache/spark/streaming/receiver/BlockGeneratorSuite.scala    | 4 ++--
 52 files changed, 57 insertions(+), 57 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java
 
b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java
index 5b42843..f219c56 100644
--- 
a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java
+++ 
b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java
@@ -86,7 +86,7 @@ public final class UnsafeInMemorySorter {
   private final PrefixComparators.RadixSortSupport radixSortSupport;
 
   /**
-   * Within this buffer, position {@code 2 * i} holds a pointer pointer to the 
record at
+   * Within this buffer, position {@code 2 * i} holds a pointer to the record 
at
    * index {@code i}, while position {@code 2 * i + 1} in the array holds an 
8-byte key prefix.
    *
    * Only part of the array will be used to store the pointers, the rest part 
is preserved as

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSortDataFormat.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSortDataFormat.java
 
b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSortDataFormat.java
index 430bf67..d9f84d1 100644
--- 
a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSortDataFormat.java
+++ 
b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSortDataFormat.java
@@ -25,7 +25,7 @@ import org.apache.spark.util.collection.SortDataFormat;
  * Supports sorting an array of (record pointer, key prefix) pairs.
  * Used in {@link UnsafeInMemorySorter}.
  * <p>
- * Within each long[] buffer, position {@code 2 * i} holds a pointer pointer 
to the record at
+ * Within each long[] buffer, position {@code 2 * i} holds a pointer to the 
record at
  * index {@code i}, while position {@code 2 * i + 1} in the array holds an 
8-byte key prefix.
  */
 public final class UnsafeSortDataFormat

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/core/src/main/scala/org/apache/spark/MapOutputTracker.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala 
b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala
index 6f5c31d..4ca442b 100644
--- a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala
+++ b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala
@@ -317,7 +317,7 @@ private[spark] class MapOutputTrackerMaster(conf: SparkConf,
     pool
   }
 
-  // Make sure that that we aren't going to exceed the max RPC message size by 
making sure
+  // Make sure that we aren't going to exceed the max RPC message size by 
making sure
   // we use broadcast to send large map output statuses.
   if (minSizeForBroadcast > maxRpcMessageSize) {
     val msg = s"spark.shuffle.mapOutput.minSizeForBroadcast 
($minSizeForBroadcast bytes) must " +

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/core/src/main/scala/org/apache/spark/TaskEndReason.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/TaskEndReason.scala 
b/core/src/main/scala/org/apache/spark/TaskEndReason.scala
index 7745387..8c1b5f7 100644
--- a/core/src/main/scala/org/apache/spark/TaskEndReason.scala
+++ b/core/src/main/scala/org/apache/spark/TaskEndReason.scala
@@ -98,7 +98,7 @@ case class FetchFailed(
    * 4 task failures, instead we immediately go back to the stage which 
generated the map output,
    * and regenerate the missing data.  (2) we don't count fetch failures for 
blacklisting, since
    * presumably its not the fault of the executor where the task ran, but the 
executor which
-   * stored the data. This is especially important because we we might rack up 
a bunch of
+   * stored the data. This is especially important because we might rack up a 
bunch of
    * fetch-failures in rapid succession, on all nodes of the cluster, due to 
one bad node.
    */
   override def countTowardsTaskFailures: Boolean = false

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/core/src/main/scala/org/apache/spark/deploy/FaultToleranceTest.scala
----------------------------------------------------------------------
diff --git 
a/core/src/main/scala/org/apache/spark/deploy/FaultToleranceTest.scala 
b/core/src/main/scala/org/apache/spark/deploy/FaultToleranceTest.scala
index 79f4d06..320af5c 100644
--- a/core/src/main/scala/org/apache/spark/deploy/FaultToleranceTest.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/FaultToleranceTest.scala
@@ -43,7 +43,7 @@ import org.apache.spark.util.{ThreadUtils, Utils}
  * Execute using
  * ./bin/spark-class org.apache.spark.deploy.FaultToleranceTest
  *
- * Make sure that that the environment includes the following properties in 
SPARK_DAEMON_JAVA_OPTS
+ * Make sure that the environment includes the following properties in 
SPARK_DAEMON_JAVA_OPTS
  * *and* SPARK_JAVA_OPTS:
  *   - spark.deploy.recoveryMode=ZOOKEEPER
  *   - spark.deploy.zookeeper.url=172.17.42.1:2181

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala
----------------------------------------------------------------------
diff --git 
a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala 
b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala
index 2b00a4a..54f39f7 100644
--- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala
@@ -291,7 +291,7 @@ object HistoryServer extends Logging {
 
   /**
    * Create a security manager.
-   * This turns off security in the SecurityManager, so that the the History 
Server can start
+   * This turns off security in the SecurityManager, so that the History 
Server can start
    * in a Spark cluster where security is enabled.
    * @param config configuration for the SecurityManager constructor
    * @return the security manager for use in constructing the History Server.

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/core/src/main/scala/org/apache/spark/executor/ShuffleReadMetrics.scala
----------------------------------------------------------------------
diff --git 
a/core/src/main/scala/org/apache/spark/executor/ShuffleReadMetrics.scala 
b/core/src/main/scala/org/apache/spark/executor/ShuffleReadMetrics.scala
index f7a9917..8dd1a1e 100644
--- a/core/src/main/scala/org/apache/spark/executor/ShuffleReadMetrics.scala
+++ b/core/src/main/scala/org/apache/spark/executor/ShuffleReadMetrics.scala
@@ -92,7 +92,7 @@ class ShuffleReadMetrics private[spark] () extends 
Serializable {
   private[spark] def setRecordsRead(v: Long): Unit = _recordsRead.setValue(v)
 
   /**
-   * Resets the value of the current metrics (`this`) and and merges all the 
independent
+   * Resets the value of the current metrics (`this`) and merges all the 
independent
    * [[TempShuffleReadMetrics]] into `this`.
    */
   private[spark] def setMergeValues(metrics: Seq[TempShuffleReadMetrics]): 
Unit = {

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala
----------------------------------------------------------------------
diff --git 
a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala
 
b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala
index dc70eb8..3d4ea3c 100644
--- 
a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala
+++ 
b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala
@@ -37,7 +37,7 @@ import org.apache.spark.storage.{BlockId, StorageLevel}
 import org.apache.spark.util.Utils
 
 /**
- * A BlockTransferService that uses Netty to fetch a set of blocks at at time.
+ * A BlockTransferService that uses Netty to fetch a set of blocks at time.
  */
 private[spark] class NettyBlockTransferService(
     conf: SparkConf,

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala 
b/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala
index 386fdfd..3bfdf95 100644
--- a/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala
+++ b/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala
@@ -350,7 +350,7 @@ object SizeEstimator extends Logging {
     // 3. consistent fields layouts throughout the hierarchy: This means we 
should layout
     // superclass first. And we can use superclass's shellSize as a starting 
point to layout the
     // other fields in this class.
-    // 4. class alignment: HotSpot rounds field blocks up to to HeapOopSize 
not 4 bytes, confirmed
+    // 4. class alignment: HotSpot rounds field blocks up to HeapOopSize not 4 
bytes, confirmed
     // with Aleksey. see https://bugs.openjdk.java.net/browse/CODETOOLS-7901322
     //
     // The real world field layout is much more complicated. There are three 
kinds of fields

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/core/src/test/scala/org/apache/spark/deploy/history/ApplicationCacheSuite.scala
----------------------------------------------------------------------
diff --git 
a/core/src/test/scala/org/apache/spark/deploy/history/ApplicationCacheSuite.scala
 
b/core/src/test/scala/org/apache/spark/deploy/history/ApplicationCacheSuite.scala
index e3304be..7998e37 100644
--- 
a/core/src/test/scala/org/apache/spark/deploy/history/ApplicationCacheSuite.scala
+++ 
b/core/src/test/scala/org/apache/spark/deploy/history/ApplicationCacheSuite.scala
@@ -253,7 +253,7 @@ class ApplicationCacheSuite extends SparkFunSuite with 
Logging with MockitoSugar
     assertNotFound(appId, None)
   }
 
-  test("Test that if an attempt ID is is set, it must be used in lookups") {
+  test("Test that if an attempt ID is set, it must be used in lookups") {
     val operations = new StubCacheOperations()
     val clock = new ManualClock(1)
     implicit val cache = new ApplicationCache(operations, retainedApplications 
= 10, clock = clock)

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
----------------------------------------------------------------------
diff --git 
a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala 
b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
index 5e8a854..f3d3f70 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
@@ -1819,7 +1819,7 @@ class DAGSchedulerSuite extends SparkFunSuite with 
LocalSparkContext with Timeou
     assert(mapOutputTracker.getMapSizesByExecutorId(shuffleId, 
0).map(_._1).toSet ===
       HashSet(makeBlockManagerId("hostA")))
 
-    // Reducer should run where RDD 2 has preferences, even though though it 
also has a shuffle dep
+    // Reducer should run where RDD 2 has preferences, even though it also has 
a shuffle dep
     val reduceTaskSet = taskSets(1)
     assertLocations(reduceTaskSet, Seq(Seq("hostB")))
     complete(reduceTaskSet, Seq((Success, 42)))
@@ -2058,7 +2058,7 @@ class DAGSchedulerSuite extends SparkFunSuite with 
LocalSparkContext with Timeou
 
     // Now complete tasks in the second task set
     val newTaskSet = taskSets(1)
-    assert(newTaskSet.tasks.size === 2)     // Both tasks 0 and 1 were on on 
hostA
+    assert(newTaskSet.tasks.size === 2)     // Both tasks 0 and 1 were on hostA
     runEvent(makeCompletionEvent(newTaskSet.tasks(0), Success, 
makeMapStatus("hostB", 2)))
     assert(results.size === 0)    // Map stage job should not be complete yet
     runEvent(makeCompletionEvent(newTaskSet.tasks(1), Success, 
makeMapStatus("hostB", 2)))

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala
----------------------------------------------------------------------
diff --git 
a/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala
 
b/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala
index 7f08382..c8b6a33 100644
--- 
a/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala
+++ 
b/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala
@@ -53,7 +53,7 @@ class ExternalAppendOnlyMapSuite extends SparkFunSuite with 
LocalSparkContext {
     conf
   }
 
-  test("single insert insert") {
+  test("single insert") {
     val conf = createSparkConf(loadDefaults = false)
     sc = new SparkContext("local", "test", conf)
     val map = createExternalMap[Int]

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/docs/ml-features.md
----------------------------------------------------------------------
diff --git a/docs/ml-features.md b/docs/ml-features.md
index 1d34497..d67fce3 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -752,7 +752,7 @@ for more details on the API.
 
 `Interaction` is a `Transformer` which takes vector or double-valued columns, 
and generates a single vector column that contains the product of all 
combinations of one value from each input column.
 
-For example, if you have 2 vector type columns each of which has 3 dimensions 
as input columns, then then you'll get a 9-dimensional vector as the output 
column.
+For example, if you have 2 vector type columns each of which has 3 dimensions 
as input columns, then you'll get a 9-dimensional vector as the output column.
 
 **Examples**
 

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/docs/mllib-statistics.md
----------------------------------------------------------------------
diff --git a/docs/mllib-statistics.md b/docs/mllib-statistics.md
index 12797bd..430c069 100644
--- a/docs/mllib-statistics.md
+++ b/docs/mllib-statistics.md
@@ -354,7 +354,7 @@ v = u.map(lambda x: 1.0 + 2.0 * x)
 useful for visualizing empirical probability distributions without requiring 
assumptions about the
 particular distribution that the observed samples are drawn from. It computes 
an estimate of the
 probability density function of a random variables, evaluated at a given set 
of points. It achieves
-this estimate by expressing the PDF of the empirical distribution at a 
particular point as the the
+this estimate by expressing the PDF of the empirical distribution at a 
particular point as the
 mean of PDFs of normal distributions centered around each of the samples.
 
 <div class="codetabs">

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/docs/structured-streaming-kafka-integration.md
----------------------------------------------------------------------
diff --git a/docs/structured-streaming-kafka-integration.md 
b/docs/structured-streaming-kafka-integration.md
index 2458bb5..9b82e8e 100644
--- a/docs/structured-streaming-kafka-integration.md
+++ b/docs/structured-streaming-kafka-integration.md
@@ -244,7 +244,7 @@ Note that the following Kafka params cannot be set and the 
Kafka source will thr
 - **group.id**: Kafka source will create a unique group id for each query 
automatically.
 - **auto.offset.reset**: Set the source option `startingOffsets` to specify
  where to start instead. Structured Streaming manages which offsets are 
consumed internally, rather 
- than rely on the kafka Consumer to do it. This will ensure that no data is 
missed when when new 
+ than rely on the kafka Consumer to do it. This will ensure that no data is 
missed when new 
  topics/partitions are dynamically subscribed. Note that `startingOffsets` 
only applies when a new
  Streaming query is started, and that resuming will always pick up from where 
the query left off.
 - **key.deserializer**: Keys are always deserialized as byte arrays with 
ByteArrayDeserializer. Use 

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/docs/structured-streaming-programming-guide.md
----------------------------------------------------------------------
diff --git a/docs/structured-streaming-programming-guide.md 
b/docs/structured-streaming-programming-guide.md
index 799f636..6cd050e 100644
--- a/docs/structured-streaming-programming-guide.md
+++ b/docs/structured-streaming-programming-guide.md
@@ -680,7 +680,7 @@ windowedCounts = words.groupBy(
 
 ### Handling Late Data and Watermarking
 Now consider what happens if one of the events arrives late to the application.
-For example, say, a word generated at 12:04 (i.e. event time) could be 
received received by 
+For example, say, a word generated at 12:04 (i.e. event time) could be 
received by 
 the application at 12:11. The application should use the time 12:04 instead of 
12:11
 to update the older counts for the window `12:00 - 12:10`. This occurs 
 naturally in our window-based grouping â Structured Streaming can maintain 
the intermediate state 

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java
----------------------------------------------------------------------
diff --git 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java
index 9041244..0e5d005 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java
@@ -52,7 +52,7 @@ public class JavaLDAExample {
     double ll = model.logLikelihood(dataset);
     double lp = model.logPerplexity(dataset);
     System.out.println("The lower bound on the log likelihood of the entire 
corpus: " + ll);
-    System.out.println("The upper bound bound on perplexity: " + lp);
+    System.out.println("The upper bound on perplexity: " + lp);
 
     // Describe topics.
     Dataset<Row> topics = model.describeTopics(3);

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/examples/src/main/python/ml/lda_example.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/ml/lda_example.py 
b/examples/src/main/python/ml/lda_example.py
index 2dc1742..a8b346f 100644
--- a/examples/src/main/python/ml/lda_example.py
+++ b/examples/src/main/python/ml/lda_example.py
@@ -46,7 +46,7 @@ if __name__ == "__main__":
     ll = model.logLikelihood(dataset)
     lp = model.logPerplexity(dataset)
     print("The lower bound on the log likelihood of the entire corpus: " + 
str(ll))
-    print("The upper bound bound on perplexity: " + str(lp))
+    print("The upper bound on perplexity: " + str(lp))
 
     # Describe topics.
     topics = model.describeTopics(3)

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/examples/src/main/scala/org/apache/spark/examples/ml/LDAExample.scala
----------------------------------------------------------------------
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/ml/LDAExample.scala 
b/examples/src/main/scala/org/apache/spark/examples/ml/LDAExample.scala
index 22b3b0e..4215d37 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/LDAExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/LDAExample.scala
@@ -50,7 +50,7 @@ object LDAExample {
     val ll = model.logLikelihood(dataset)
     val lp = model.logPerplexity(dataset)
     println(s"The lower bound on the log likelihood of the entire corpus: $ll")
-    println(s"The upper bound bound on perplexity: $lp")
+    println(s"The upper bound on perplexity: $lp")
 
     // Describe topics.
     val topics = model.describeTopics(3)

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkSink.scala
----------------------------------------------------------------------
diff --git 
a/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkSink.scala
 
b/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkSink.scala
index 41f27e9..e5b63aa 100644
--- 
a/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkSink.scala
+++ 
b/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkSink.scala
@@ -45,7 +45,7 @@ import org.apache.flume.sink.AbstractSink
  * the thread itself is blocked and a reference to it saved off.
  *
  * When the ack for that batch is received,
- * the thread which created the transaction is is retrieved and it commits the 
transaction with the
+ * the thread which created the transaction is retrieved and it commits the 
transaction with the
  * channel from the same thread it was originally created in (since Flume 
transactions are
  * thread local). If a nack is received instead, the sink rolls back the 
transaction. If no ack
  * is received within the specified timeout, the transaction is rolled back 
too. If an ack comes

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala
----------------------------------------------------------------------
diff --git 
a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala
 
b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala
index aa01238..ff9965b 100644
--- 
a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala
+++ 
b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala
@@ -212,7 +212,7 @@ private[kafka010] class KafkaSourceProvider extends 
StreamSourceProvider
            |Instead set the source option '$STARTING_OFFSETS_OPTION_KEY' to 
'earliest' or 'latest'
            |to specify where to start. Structured Streaming manages which 
offsets are consumed
            |internally, rather than relying on the kafkaConsumer to do it. 
This will ensure that no
-           |data is missed when when new topics/partitions are dynamically 
subscribed. Note that
+           |data is missed when new topics/partitions are dynamically 
subscribed. Note that
            |'$STARTING_OFFSETS_OPTION_KEY' only applies when a new Streaming 
query is started, and
            |that resuming will always pick up from where the query left off. 
See the docs for more
            |details.

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala
----------------------------------------------------------------------
diff --git 
a/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala
 
b/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala
index a4d81a6..18a5a15 100644
--- 
a/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala
+++ 
b/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala
@@ -129,7 +129,7 @@ abstract class 
KinesisBackedBlockRDDTests(aggregateTestData: Boolean)
 
   /**
    * Test the WriteAheadLogBackedRDD, by writing some partitions of the data 
to block manager
-   * and the rest to a write ahead log, and then reading reading it all back 
using the RDD.
+   * and the rest to a write ahead log, and then reading it all back using the 
RDD.
    * It can also test if the partitions that were read from the log were again 
stored in
    * block manager.
    *

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala 
b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
index 583e5e0..728a883 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
@@ -512,7 +512,7 @@ abstract class LDAModel private[ml] (
   }
 
   /**
-   * Calculate an upper bound bound on perplexity.  (Lower is better.)
+   * Calculate an upper bound on perplexity.  (Lower is better.)
    * See Equation (16) in the Online LDA paper (Hoffman et al., 2010).
    *
    * WARNING: If this model is an instance of [[DistributedLDAModel]] 
(produced when [[optimizer]]

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/mllib/src/main/scala/org/apache/spark/mllib/api/python/Word2VecModelWrapper.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/api/python/Word2VecModelWrapper.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/api/python/Word2VecModelWrapper.scala
index 5cbfbff..4d6520d 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/api/python/Word2VecModelWrapper.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/api/python/Word2VecModelWrapper.scala
@@ -54,7 +54,7 @@ private[python] class Word2VecModelWrapper(model: 
Word2VecModel) {
   }
 
   /**
-   * Finds words similar to the the vector representation of a word without
+   * Finds words similar to the vector representation of a word without
    * filtering results.
    * @param vector a vector
    * @param num number of synonyms to find

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 25ffd85..933a5f1 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -245,7 +245,7 @@ class LocalLDAModel private[spark] (
   }
 
   /**
-   * Calculate an upper bound bound on perplexity.  (Lower is better.)
+   * Calculate an upper bound on perplexity.  (Lower is better.)
    * See Equation (16) in original Online LDA paper.
    *
    * @param documents test corpus to use for calculating perplexity

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala
index 46deb54..f44c8fe 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala
@@ -29,7 +29,7 @@ import org.apache.spark.streaming.dstream.DStream
 /**
  * :: DeveloperApi ::
  * StreamingLinearAlgorithm implements methods for continuously
- * training a generalized linear model model on streaming data,
+ * training a generalized linear model on streaming data,
  * and using it for prediction on (possibly different) streaming data.
  *
  * This class takes as type parameters a GeneralizedLinearModel,

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/python/pyspark/ml/clustering.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 35d0aef..54510e0 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -699,7 +699,7 @@ class LDAModel(JavaModel):
     @since("2.0.0")
     def logPerplexity(self, dataset):
         """
-        Calculate an upper bound bound on perplexity.  (Lower is better.)
+        Calculate an upper bound on perplexity.  (Lower is better.)
         See Equation (16) in the Online LDA paper (Hoffman et al., 2010).
 
         WARNING: If this model is an instance of 
:py:class:`DistributedLDAModel` (produced when

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/python/pyspark/ml/linalg/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/linalg/__init__.py 
b/python/pyspark/ml/linalg/__init__.py
index 1705c15..b765343 100644
--- a/python/pyspark/ml/linalg/__init__.py
+++ b/python/pyspark/ml/linalg/__init__.py
@@ -481,7 +481,7 @@ class SparseVector(Vector):
         >>> SparseVector(4, {1:1.0, 6:2.0})
         Traceback (most recent call last):
         ...
-        AssertionError: Index 6 is out of the the size of vector with size=4
+        AssertionError: Index 6 is out of the size of vector with size=4
         >>> SparseVector(4, {-1:1.0})
         Traceback (most recent call last):
         ...
@@ -521,7 +521,7 @@ class SparseVector(Vector):
 
         if self.indices.size > 0:
             assert np.max(self.indices) < self.size, \
-                "Index %d is out of the the size of vector with size=%d" \
+                "Index %d is out of the size of vector with size=%d" \
                 % (np.max(self.indices), self.size)
             assert np.min(self.indices) >= 0, \
                 "Contains negative index %d" % (np.min(self.indices))

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/python/pyspark/sql/utils.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/utils.py b/python/pyspark/sql/utils.py
index 2a85ec0..7bc6a59 100644
--- a/python/pyspark/sql/utils.py
+++ b/python/pyspark/sql/utils.py
@@ -95,7 +95,7 @@ def install_exception_handler():
     original = py4j.protocol.get_return_value
     # The original `get_return_value` is not patched, it's idempotent.
     patched = capture_sql_exception(original)
-    # only patch the one used in in py4j.java_gateway (call Java API)
+    # only patch the one used in py4j.java_gateway (call Java API)
     py4j.java_gateway.get_return_value = patched
 
 

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/LocalityPreferredContainerPlacementStrategy.scala
----------------------------------------------------------------------
diff --git 
a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/LocalityPreferredContainerPlacementStrategy.scala
 
b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/LocalityPreferredContainerPlacementStrategy.scala
index 8772e26..fb2d61f 100644
--- 
a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/LocalityPreferredContainerPlacementStrategy.scala
+++ 
b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/LocalityPreferredContainerPlacementStrategy.scala
@@ -32,7 +32,7 @@ private[yarn] case class ContainerLocalityPreferences(nodes: 
Array[String], rack
 
 /**
  * This strategy is calculating the optimal locality preferences of YARN 
containers by considering
- * the node ratio of pending tasks, number of required cores/containers and 
and locality of current
+ * the node ratio of pending tasks, number of required cores/containers and 
locality of current
  * existing and pending allocated containers. The target of this algorithm is 
to maximize the number
  * of tasks that would run locally.
  *

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
 
b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
index d205547..86de909 100644
--- 
a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
+++ 
b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
@@ -196,7 +196,7 @@ public final class UnsafeRow extends InternalRow implements 
Externalizable, Kryo
     assertIndexIsValid(i);
     BitSetMethods.set(baseObject, baseOffset, i);
     // To preserve row equality, zero out the value when setting the column to 
null.
-    // Since this row does does not currently support updates to 
variable-length values, we don't
+    // Since this row does not currently support updates to variable-length 
values, we don't
     // have to worry about zeroing out that data.
     Platform.putLong(baseObject, getFieldOffset(i), 0);
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
index 13115f4..07d294b 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
@@ -516,7 +516,7 @@ case class CumeDist() extends RowNumberLike with 
SizeBasedWindowFunction {
  * into the number of buckets); both variables are based on the size of the 
current partition.
  * During the calculation process the function keeps track of the current row 
number, the current
  * bucket number, and the row number at which the bucket will change 
(bucketThreshold). When the
- * current row number reaches bucket threshold, the bucket value is increased 
by one and the the
+ * current row number reaches bucket threshold, the bucket value is increased 
by one and the
  * threshold is increased by the bucket size (plus one extra if the current 
bucket is padded).
  *
  * This documentation has been based upon similar documentation for the Hive 
and Presto projects.

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
index d583fa3..c977e78 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
@@ -795,7 +795,7 @@ case object OneRowRelation extends LeafNode {
 
   /**
    * Computes [[Statistics]] for this plan. The default implementation assumes 
the output
-   * cardinality is the product of of all child plan's cardinality, i.e. 
applies in the case
+   * cardinality is the product of all child plan's cardinality, i.e. applies 
in the case
    * of cartesian joins.
    *
    * [[LeafNode]]s must override this.

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index 235ca8d..a96a3b7 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -142,7 +142,7 @@ object DateTimeUtils {
   }
 
   /**
-   * Returns the number of days since epoch from from java.sql.Date.
+   * Returns the number of days since epoch from java.sql.Date.
    */
   def fromJavaDate(date: Date): SQLDate = {
     millisToDays(date.getTime)
@@ -503,7 +503,7 @@ object DateTimeUtils {
   }
 
   /**
-   * Calculates the year and and the number of the day in the year for the 
given
+   * Calculates the year and the number of the day in the year for the given
    * number of days. The given days is the number of days since 1.1.1970.
    *
    * The calculation uses the fact that the period 1.1.2001 until 31.12.2400 is

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/AttributeSetSuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/AttributeSetSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/AttributeSetSuite.scala
index 97cfb5f..273f95f 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/AttributeSetSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/AttributeSetSuite.scala
@@ -52,7 +52,7 @@ class AttributeSetSuite extends SparkFunSuite {
     assert((aSet ++ bSet).contains(aLower) === true)
   }
 
-  test("extracts all references references") {
+  test("extracts all references ") {
     val addSet = AttributeSet(Add(aUpper, Alias(bUpper, "test")()):: Nil)
     assert(addSet.contains(aUpper))
     assert(addSet.contains(aLower))

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index c1cedd8..2a06f3c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -361,7 +361,7 @@ class Dataset[T] private[sql](
    * method used to map columns depend on the type of `U`:
    *  - When `U` is a class, fields for the class will be mapped to columns of 
the same name
    *    (case sensitivity is determined by `spark.sql.caseSensitive`).
-   *  - When `U` is a tuple, the columns will be be mapped by ordinal (i.e. 
the first column will
+   *  - When `U` is a tuple, the columns will be mapped by ordinal (i.e. the 
first column will
    *    be assigned to `_1`).
    *  - When `U` is a primitive type (i.e. String, Int, etc), then the first 
column of the
    *    `DataFrame` will be used.

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala
index bc29070..bad5996 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala
@@ -41,7 +41,7 @@ object PartitionPath {
 }
 
 /**
- * Holds a directory in a partitioned collection of files as well as as the 
partition values
+ * Holds a directory in a partitioned collection of files as well as the 
partition values
  * in the form of a Row.  Before scanning, the files at `path` need to be 
enumerated.
  */
 case class PartitionPath(values: InternalRow, path: Path)

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala
index 0ce47b1..0b39965 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala
@@ -285,7 +285,7 @@ final class DataStreamWriter[T] private[sql](ds: 
Dataset[T]) {
 
   /**
    * Starts the execution of the streaming query, which will continually send 
results to the given
-   * `ForeachWriter` as as new data arrives. The `ForeachWriter` can be used 
to send the data
+   * `ForeachWriter` as new data arrives. The `ForeachWriter` can be used to 
send the data
    * generated by the `DataFrame`/`Dataset` to an external system.
    *
    * Scala example:

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTimeWindowingSuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTimeWindowingSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTimeWindowingSuite.scala
index 4296ec5..22d5c47 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTimeWindowingSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTimeWindowingSuite.scala
@@ -257,7 +257,7 @@ class DataFrameTimeWindowingSuite extends QueryTest with 
SharedSQLContext with B
     }
   }
 
-  test("time window in SQL with with two expressions") {
+  test("time window in SQL with two expressions") {
     withTempTable { table =>
       checkAnswer(
         spark.sql(
@@ -272,7 +272,7 @@ class DataFrameTimeWindowingSuite extends QueryTest with 
SharedSQLContext with B
     }
   }
 
-  test("time window in SQL with with three expressions") {
+  test("time window in SQL with three expressions") {
     withTempTable { table =>
       checkAnswer(
         spark.sql(

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
index 375da22..0bfc92f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
@@ -363,7 +363,7 @@ class PlannerSuite extends SharedSQLContext {
   // This is a regression test for SPARK-9703
   test("EnsureRequirements should not repartition if only ordering requirement 
is unsatisfied") {
     // Consider an operator that imposes both output distribution and  
ordering requirements on its
-    // children, such as sort sort merge join. If the distribution 
requirements are satisfied but
+    // children, such as sort merge join. If the distribution requirements are 
satisfied but
     // the output ordering requirements are unsatisfied, then the planner 
should only add sorts and
     // should not need to add additional shuffles / exchanges.
     val outputOrdering = Seq(SortOrder(Literal(1), Ascending))

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala
index 22f59f6..f67444f 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala
@@ -144,7 +144,7 @@ class FileStreamSinkSuite extends StreamTest {
   }
 
   // This tests whether FileStreamSink works with aggregations. Specifically, 
it tests
-  // whether the the correct streaming QueryExecution (i.e. 
IncrementalExecution) is used to
+  // whether the correct streaming QueryExecution (i.e. IncrementalExecution) 
is used to
   // to execute the trigger for writing data to file sink. See SPARK-18440 for 
more details.
   test("writing with aggregation") {
 

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/sql/hive-thriftserver/src/main/java/org/apache/hive/service/AbstractService.java
----------------------------------------------------------------------
diff --git 
a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/AbstractService.java
 
b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/AbstractService.java
index c2a2b2d..9dd0efc 100644
--- 
a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/AbstractService.java
+++ 
b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/AbstractService.java
@@ -151,7 +151,7 @@ public abstract class AbstractService implements Service {
   }
 
   /**
-   * Verify that that a service is in a given state.
+   * Verify that a service is in a given state.
    *
    * @param currentState
    *          the desired state

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/sql/hive-thriftserver/src/main/java/org/apache/hive/service/ServiceOperations.java
----------------------------------------------------------------------
diff --git 
a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/ServiceOperations.java
 
b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/ServiceOperations.java
index 8946219..a2c580d 100644
--- 
a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/ServiceOperations.java
+++ 
b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/ServiceOperations.java
@@ -33,7 +33,7 @@ public final class ServiceOperations {
   }
 
   /**
-   * Verify that that a service is in a given state.
+   * Verify that a service is in a given state.
    * @param state the actual state a service is in
    * @param expectedState the desired state
    * @throws IllegalStateException if the service state is different from

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/sql/hive-thriftserver/src/main/java/org/apache/hive/service/ServiceStateChangeListener.java
----------------------------------------------------------------------
diff --git 
a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/ServiceStateChangeListener.java
 
b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/ServiceStateChangeListener.java
index d1aadad..a1ff10d 100644
--- 
a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/ServiceStateChangeListener.java
+++ 
b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/ServiceStateChangeListener.java
@@ -29,7 +29,7 @@ public interface ServiceStateChangeListener {
    * have changed state before this callback is invoked.
    *
    * This operation is invoked on the thread that initiated the state change,
-   * while the service itself in in a synchronized section.
+   * while the service itself in a synchronized section.
    * <ol>
    *   <li>Any long-lived operation here will prevent the service state
    *   change from completing in a timely manner.</li>

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/TypeDescriptor.java
----------------------------------------------------------------------
diff --git 
a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/TypeDescriptor.java
 
b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/TypeDescriptor.java
index 562b3f5..b80fd67 100644
--- 
a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/TypeDescriptor.java
+++ 
b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/TypeDescriptor.java
@@ -98,7 +98,7 @@ public class TypeDescriptor {
    * For datetime types this is the length in characters of the String 
representation
    * (assuming the maximum allowed precision of the fractional seconds 
component).
    * For binary data this is the length in bytes.
-   * Null is returned for for data types where the column size is not 
applicable.
+   * Null is returned for data types where the column size is not applicable.
    */
   public Integer getColumnSize() {
     if (type.isNumericType()) {

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
 
b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
index 5cd4935..d217e9b 100644
--- 
a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
+++ 
b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
@@ -178,7 +178,7 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with 
BeforeAndAfter {
     "skewjoin",
     "database",
 
-    // These tests fail and and exit the JVM.
+    // These tests fail and exit the JVM.
     "auto_join18_multi_distinct",
     "join18_multi_distinct",
     "input44",

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala
----------------------------------------------------------------------
diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala
 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala
index c80695b..7ee5fc5 100644
--- 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala
+++ 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala
@@ -40,7 +40,7 @@ import org.apache.spark.util.Utils
  * The Hive table scan operator.  Column and partition pruning are both 
handled.
  *
  * @param requestedAttributes Attributes to be fetched from the Hive table.
- * @param relation The Hive table be be scanned.
+ * @param relation The Hive table be scanned.
  * @param partitionPruningPred An optional partition pruning predicate for 
partitioned table.
  */
 private[hive]

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/streaming/src/main/scala/org/apache/spark/streaming/State.scala
----------------------------------------------------------------------
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/State.scala 
b/streaming/src/main/scala/org/apache/spark/streaming/State.scala
index 3f560f8..23cf48e 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/State.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/State.scala
@@ -178,7 +178,7 @@ private[streaming] class StateImpl[S] extends State[S] {
     removed
   }
 
-  /** Whether the state has been been updated */
+  /** Whether the state has been updated */
   def isUpdated(): Boolean = {
     updated
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala
----------------------------------------------------------------------
diff --git 
a/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala
 
b/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala
index a3c125c..9a760e2 100644
--- 
a/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala
+++ 
b/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala
@@ -88,7 +88,7 @@ abstract class InputDStream[T: ClassTag](_ssc: 
StreamingContext)
     if (!super.isTimeValid(time)) {
       false // Time not valid
     } else {
-      // Time is valid, but check it it is more than lastValidTime
+      // Time is valid, but check it is more than lastValidTime
       if (lastValidTime != null && time < lastValidTime) {
         logWarning(s"isTimeValid called with $time whereas the last valid time 
" +
           s"is $lastValidTime")

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/streaming/src/test/scala/org/apache/spark/streaming/rdd/MapWithStateRDDSuite.scala
----------------------------------------------------------------------
diff --git 
a/streaming/src/test/scala/org/apache/spark/streaming/rdd/MapWithStateRDDSuite.scala
 
b/streaming/src/test/scala/org/apache/spark/streaming/rdd/MapWithStateRDDSuite.scala
index e8c814b..9b6bc71 100644
--- 
a/streaming/src/test/scala/org/apache/spark/streaming/rdd/MapWithStateRDDSuite.scala
+++ 
b/streaming/src/test/scala/org/apache/spark/streaming/rdd/MapWithStateRDDSuite.scala
@@ -326,7 +326,7 @@ class MapWithStateRDDSuite extends SparkFunSuite with 
RDDCheckpointTester with B
       // Create a MapWithStateRDD that has a long lineage using the data RDD 
with a long lineage
       val stateRDDWithLongLineage = 
makeStateRDDWithLongLineageDataRDD(longLineageRDD)
 
-      // Create a new MapWithStateRDD, with the lineage lineage 
MapWithStateRDD as the parent
+      // Create a new MapWithStateRDD, with the lineage MapWithStateRDD as the 
parent
       new MapWithStateRDD[Int, Int, Int, Int](
         stateRDDWithLongLineage,
         stateRDDWithLongLineage.sparkContext.emptyRDD[(Int, 
Int)].partitionBy(partitioner),

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/streaming/src/test/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDDSuite.scala
----------------------------------------------------------------------
diff --git 
a/streaming/src/test/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDDSuite.scala
 
b/streaming/src/test/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDDSuite.scala
index a37fac8..c5e695a 100644
--- 
a/streaming/src/test/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDDSuite.scala
+++ 
b/streaming/src/test/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDDSuite.scala
@@ -108,7 +108,7 @@ class WriteAheadLogBackedBlockRDDSuite
 
   /**
    * Test the WriteAheadLogBackedRDD, by writing some partitions of the data 
to block manager
-   * and the rest to a write ahead log, and then reading reading it all back 
using the RDD.
+   * and the rest to a write ahead log, and then reading it all back using the 
RDD.
    * It can also test if the partitions that were read from the log were again 
stored in
    * block manager.
    *

http://git-wip-us.apache.org/repos/asf/spark/blob/a1e40b1f/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala
----------------------------------------------------------------------
diff --git 
a/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala
 
b/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala
index a1d0561..b70383e 100644
--- 
a/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala
+++ 
b/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala
@@ -90,7 +90,7 @@ class BlockGeneratorSuite extends SparkFunSuite with 
BeforeAndAfter {
     listener.pushedData.asScala.toSeq should contain theSameElementsInOrderAs 
(data1)
     assert(listener.onAddDataCalled === false) // should be called only with 
addDataWithCallback()
 
-    // Verify addDataWithCallback() add data+metadata and and callbacks are 
called correctly
+    // Verify addDataWithCallback() add data+metadata and callbacks are called 
correctly
     val data2 = 11 to 20
     val metadata2 = data2.map { _.toString }
     data2.zip(metadata2).foreach { case (d, m) => 
blockGenerator.addDataWithCallback(d, m) }
@@ -103,7 +103,7 @@ class BlockGeneratorSuite extends SparkFunSuite with 
BeforeAndAfter {
       listener.pushedData.asScala.toSeq should contain 
theSameElementsInOrderAs combined
     }
 
-    // Verify addMultipleDataWithCallback() add data+metadata and and 
callbacks are called correctly
+    // Verify addMultipleDataWithCallback() add data+metadata and callbacks 
are called correctly
     val data3 = 21 to 30
     val metadata3 = "metadata"
     blockGenerator.addMultipleDataWithCallback(data3.iterator, metadata3)


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

spark git commit: [MINOR][DOCS] Remove consecutive duplicated words/typo in Spark Repo

Reply via email to