CRUNCH-616: Replace (possibly copyrighted) Maugham text with Dickens. Contributed by Sean Owen.
Remove non-applicable Project Gutenberg license. Adjust lots of tests to match new text. Project: http://git-wip-us.apache.org/repos/asf/crunch/repo Commit: http://git-wip-us.apache.org/repos/asf/crunch/commit/5d237b36 Tree: http://git-wip-us.apache.org/repos/asf/crunch/tree/5d237b36 Diff: http://git-wip-us.apache.org/repos/asf/crunch/diff/5d237b36 Branch: refs/heads/master Commit: 5d237b36609484d49c30fa92fdf9613b6eee9d91 Parents: f1d074c Author: Tom White <[email protected]> Authored: Thu Sep 8 14:12:30 2016 +0100 Committer: Tom White <[email protected]> Committed: Thu Sep 8 14:12:30 2016 +0100 ---------------------------------------------------------------------- LICENSE | 298 - .../it/java/org/apache/crunch/CleanTextIT.java | 2 +- .../org/apache/crunch/CollectionPObjectIT.java | 4 +- .../org/apache/crunch/CollectionsLengthIT.java | 4 +- .../apache/crunch/DeepCopyCustomTuplesIT.java | 2 +- .../apache/crunch/FirstElementPObjectIT.java | 2 +- .../it/java/org/apache/crunch/PObjectsIT.java | 2 +- .../org/apache/crunch/PipelineCallableIT.java | 2 +- .../it/java/org/apache/crunch/RecordDropIT.java | 2 +- .../apache/crunch/StageResultsCountersIT.java | 2 +- .../it/java/org/apache/crunch/WordCountIT.java | 8 +- .../apache/crunch/impl/mr/plan/DotfilesIT.java | 4 +- .../it/java/org/apache/crunch/lib/MapredIT.java | 4 +- .../java/org/apache/crunch/lib/MapreduceIT.java | 2 +- .../lib/join/AbstractFullOuterJoinIT.java | 4 +- .../crunch/lib/join/AbstractInnerJoinIT.java | 4 +- .../lib/join/AbstractLeftOuterJoinIT.java | 4 +- .../lib/join/AbstractRightOuterJoinIT.java | 4 +- .../org/apache/crunch/lib/join/JoinTester.java | 6 +- .../apache/crunch/io/hbase/HFileTargetIT.java | 16 +- .../scrunch/AggregatorsIntegrationTest.scala | 2 +- .../org/apache/crunch/scrunch/CogroupTest.scala | 6 +- .../apache/crunch/scrunch/IncrementTest.scala | 8 +- .../org/apache/crunch/scrunch/JoinTest.scala | 12 +- .../apache/crunch/scrunch/PCollectionTest.scala | 6 +- .../apache/crunch/scrunch/PipelineAppTest.scala | 2 +- .../org/apache/crunch/scrunch/TopTest.scala | 2 +- .../org/apache/crunch/scrunch/UnionTest.scala | 12 +- .../apache/crunch/scrunch/WordCountTest.scala | 2 +- .../org/apache/crunch/SparkHFileTargetIT.java | 16 +- .../apache/crunch/SparkPipelineCallableIT.java | 2 +- crunch-test/src/main/resources/dickens.txt | 23665 ++++++++++++++ crunch-test/src/main/resources/maugham.txt | 29112 ----------------- crunch-test/src/main/resources/shakes.txt | 382 - 34 files changed, 23739 insertions(+), 29866 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/LICENSE ---------------------------------------------------------------------- diff --git a/LICENSE b/LICENSE index 23c8577..ae4b6b6 100644 --- a/LICENSE +++ b/LICENSE @@ -240,304 +240,6 @@ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. --------------------------------------------------------------------------------- -Test cases use content provided by Project Gutenberg: - -THE FULL PROJECT GUTENBERG LICENSE - -PLEASE READ THIS BEFORE YOU DISTRIBUTE OR USE THIS WORK - -To protect the Project Gutenberg-tm mission of promoting the free distribution -of electronic works, by using or distributing this work (or any other work -associated in any way with the phrase "Project Gutenberg"), you agree to comply -with all the terms of the Full Project Gutenberg-tm License available with this -file or online at www.gutenberg.org/license. - -Section 1. General Terms of Use and Redistributing Project Gutenberg-tm -electronic works 1.A. By reading or using any part of this Project Gutenberg-tm -electronic work, you indicate that you have read, understand, agree to and -accept all the terms of this license and intellectual property -(trademark/copyright) agreement. If you do not agree to abide by all the terms -of this agreement, you must cease using and return or destroy all copies of -Project Gutenberg-tm electronic works in your possession. If you paid a fee for -obtaining a copy of or access to a Project Gutenberg-tm electronic work and you -do not agree to be bound by the terms of this agreement, you may obtain a -refund from the person or entity to whom you paid the fee as set forth in -paragraph 1.E.8. - -1.B. "Project Gutenberg" is a registered trademark. It may only be used on or -associated in any way with an electronic work by people who agree to be bound -by the terms of this agreement. There are a few things that you can do with -most Project Gutenberg-tm electronic works even without complying with the full -terms of this agreement. See paragraph 1.C below. There are a lot of things you -can do with Project Gutenberg-tm electronic works if you follow the terms of -this agreement and help preserve free future access to Project Gutenberg-tm -electronic works. See paragraph 1.E below. - -1.C. The Project Gutenberg Literary Archive Foundation ("the Foundation" or -PGLAF), owns a compilation copyright in the collection of Project Gutenberg-tm -electronic works. Nearly all the individual works in the collection are in the -public domain in the United States. If an individual work is in the public -domain in the United States and you are located in the United States, we do not -claim a right to prevent you from copying, distributing, performing, displaying -or creating derivative works based on the work as long as all references to -Project Gutenberg are removed. Of course, we hope that you will support the -Project Gutenberg-tm mission of promoting free access to electronic works by -freely sharing Project Gutenberg-tm works in compliance with the terms of this -agreement for keeping the Project Gutenberg-tm name associated with the work. -You can easily comply with the terms of this agreement by keeping this work in -the same format with its attached full Project Gutenberg-tm License when you -share it without charge with others. - -[*] This particular work is one of the few copyrighted individual works -included with the permission of the copyright holder. Information on the -copyright owner for this particular work and the terms of use imposed by the -copyright holder on this work are set forth at the beginning of this work. - -1.D. The copyright laws of the place where you are located also govern what you -can do with this work. Copyright laws in most countries are in a constant state -of change. If you are outside the United States, check the laws of your country -in addition to the terms of this agreement before downloading, copying, -displaying, performing, distributing or creating derivative works based on this -work or any other Project Gutenberg-tm work. The Foundation makes no -representations concerning the copyright status of any work in any country -outside the United States. - -1.E. Unless you have removed all references to Project Gutenberg: - -1.E.1. The following sentence, with active links to, or other immediate access -to, the full Project Gutenberg-tm License must appear prominently whenever any -copy of a Project Gutenberg-tm work (any work on which the phrase "Project -Gutenberg" appears, or with which the phrase "Project Gutenberg" is associated) -is accessed, displayed, performed, viewed, copied or distributed: - -This eBook is for the use of anyone anywhere at no cost and with almost no -restrictions whatsoever. You may copy it, give it away or re-use it under the -terms of the Project Gutenberg License included with this eBook or online at -www.gutenberg.org - -1.E.2. If an individual Project Gutenberg-tm electronic work is derived from -the public domain (does not contain a notice indicating that it is posted with -permission of the copyright holder), the work can be copied and distributed to -anyone in the United States without paying any fees or charges. If you are -redistributing or providing access to a work with the phrase "Project -Gutenberg" associated with or appearing on the work, you must comply either -with the requirements of paragraphs 1.E.1 through 1.E.7 or obtain permission -for the use of the work and the Project Gutenberg-tm trademark as set forth in -paragraphs 1.E.8 or 1.E.9. - -1.E.3. If an individual Project Gutenberg-tm electronic work is posted with the -permission of the copyright holder, your use and distribution must comply with -both paragraphs 1.E.1 through 1.E.7 and any additional terms imposed by the -copyright holder. Additional terms will be linked to the Project Gutenberg-tm -License for all works posted with the permission of the copyright holder found -at the beginning of this work. - -1.E.4. Do not unlink or detach or remove the full Project Gutenberg-tm License -terms from this work, or any files containing a part of this work or any other -work associated with Project Gutenberg-tm. - -1.E.5. Do not copy, display, perform, distribute or redistribute this -electronic work, or any part of this electronic work, without prominently -displaying the sentence set forth in paragraph 1.E.1 with active links or -immediate access to the full terms of the Project Gutenberg-tm License. - -1.E.6. You may convert to and distribute this work in any binary, compressed, -marked up, nonproprietary or proprietary form, including any word processing or -hypertext form. However, if you provide access to or distribute copies of a -Project Gutenberg-tm work in a format other than "Plain Vanilla ASCII" or other -format used in the official version posted on the official Project Gutenberg-tm -web site (www.gutenberg.org), you must, at no additional cost, fee or expense -to the user, provide a copy, a means of exporting a copy, or a means of -obtaining a copy upon request, of the work in its original "Plain Vanilla -ASCII" or other form. Any alternate format must include the full Project -Gutenberg-tm License as specified in paragraph 1.E.1. - -1.E.7. Do not charge a fee for access to, viewing, displaying, performing, -copying or distributing any Project Gutenberg-tm works unless you comply with -paragraph 1.E.8 or 1.E.9. - -1.E.8. You may charge a reasonable fee for copies of or providing access to or -distributing Project Gutenberg-tm electronic works provided that - -You pay a royalty fee of 20% of the gross profits you derive from the use of -Project Gutenberg-tm works calculated using the method you already use to -calculate your applicable taxes. The fee is owed to the owner of the Project -Gutenberg-tm trademark, but he has agreed to donate royalties under this -paragraph to the Project Gutenberg Literary Archive Foundation. Royalty -payments must be paid within 60 days following each date on which you prepare -(or are legally required to prepare) your periodic tax returns. Royalty -payments should be clearly marked as such and sent to the Project Gutenberg -Literary Archive Foundation at the address specified in Section 4, "Information -about donations to the Project Gutenberg Literary Archive Foundation." You -provide a full refund of any money paid by a user who notifies you in writing -(or by e-mail) within 30 days of receipt that s/he does not agree to the terms -of the full Project Gutenberg-tm License. You must require such a user to -return or destroy all copies of the works possessed in a physical medium and -discontinue all use of and all access to other copies of Project Gutenberg-tm -works. -You provide, in accordance with paragraph 1.F.3, a full refund of any money -paid for a work or a replacement copy, if a defect in the electronic work is -discovered and reported to you within 90 days of receipt of the work. You -comply with all other terms of this agreement for free distribution of Project -Gutenberg-tm works. - -1.E.9. If you wish to charge a fee or distribute a Project Gutenberg-tm -electronic work or group of works on different terms than are set forth in this -agreement, you must obtain permission in writing from both the Project -Gutenberg Literary Archive Foundation and Michael Hart, the owner of the -Project Gutenberg-tm trademark. Contact the Foundation as set forth in Section -3 below. - -1.F. - -1.F.1. Project Gutenberg volunteers and employees expend considerable effort to -identify, do copyright research on, transcribe and proofread public domain -works in creating the Project Gutenberg-tm collection. Despite these efforts, -Project Gutenberg-tm electronic works, and the medium on which they may be -stored, may contain "Defects," such as, but not limited to, incomplete, -inaccurate or corrupt data, transcription errors, a copyright or other -intellectual property infringement, a defective or damaged disk or other -medium, a computer virus, or computer codes that damage or cannot be read by -your equipment. - -1.F.2. LIMITED WARRANTY, DISCLAIMER OF DAMAGES - Except for the "Right of -Replacement or Refund" described in paragraph 1.F.3, the Project Gutenberg -Literary Archive Foundation, the owner of the Project Gutenberg-tm trademark, -and any other party distributing a Project Gutenberg-tm electronic work under -this agreement, disclaim all liability to you for damages, costs and expenses, -including legal fees. YOU AGREE THAT YOU HAVE NO REMEDIES FOR NEGLIGENCE, -STRICT LIABILITY, BREACH OF WARRANTY OR BREACH OF CONTRACT EXCEPT THOSE -PROVIDED IN PARAGRAPH 1.F.3. YOU AGREE THAT THE FOUNDATION, THE TRADEMARK -OWNER, AND ANY DISTRIBUTOR UNDER THIS AGREEMENT WILL NOT BE LIABLE TO YOU FOR -ACTUAL, DIRECT, INDIRECT, CONSEQUENTIAL, PUNITIVE OR INCIDENTAL DAMAGES EVEN IF -YOU GIVE NOTICE OF THE POSSIBILITY OF SUCH DAMAGE. - -1.F.3. LIMITED RIGHT OF REPLACEMENT OR REFUND - If you discover a defect in -this electronic work within 90 days of receiving it, you can receive a refund -of the money (if any) you paid for it by sending a written explanation to the -person you received the work from. If you received the work on a physical -medium, you must return the medium with your written explanation. The person or -entity that provided you with the defective work may elect to provide a -replacement copy in lieu of a refund. If you received the work electronically, -the person or entity providing it to you may choose to give you a second -opportunity to receive the work electronically in lieu of a refund. If the -second copy is also defective, you may demand a refund in writing without -further opportunities to fix the problem. - -1.F.4. Except for the limited right of replacement or refund set forth in -paragraph 1.F.3, this work is provided to you 'AS-IS', WITH NO OTHER WARRANTIES -OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO WARRANTIES OF -MERCHANTABILITY OR FITNESS FOR ANY PURPOSE. - -1.F.5. Some states do not allow disclaimers of certain implied warranties or -the exclusion or limitation of certain types of damages. If any disclaimer or -limitation set forth in this agreement violates the law of the state applicable -to this agreement, the agreement shall be interpreted to make the maximum -disclaimer or limitation permitted by the applicable state law. The invalidity -or unenforceability of any provision of this agreement shall not void the -remaining provisions. - -1.F.6. INDEMNITY - You agree to indemnify and hold the Foundation, the -trademark owner, any agent or employee of the Foundation, anyone providing -copies of Project Gutenberg-tm electronic works in accordance with this -agreement, and any volunteers associated with the production, promotion and -distribution of Project Gutenberg-tm electronic works, harmless from all -liability, costs and expenses, including legal fees, that arise directly or -indirectly from any of the following which you do or cause to occur: (a) -distribution of this or any Project Gutenberg-tm work, (b) alteration, -modification, or additions or deletions to any Project Gutenberg-tm work, and -(c) any Defect you cause. - -Section 2. Information about the Mission of Project Gutenberg-tm Project -Gutenberg-tm is synonymous with the free distribution of electronic works in -formats readable by the widest variety of computers including obsolete, old, -middle-aged and new computers. It exists because of the efforts of hundreds of -volunteers and donations from people in all walks of life. - -Volunteers and financial support to provide volunteers with the assistance they -need are critical to reaching Project Gutenberg-tm's goals and ensuring that -the Project Gutenberg-tm collection will remain freely available for -generations to come. In 2001, the Project Gutenberg Literary Archive Foundation -was created to provide a secure and permanent future for Project Gutenberg-tm -and future generations. To learn more about the Project Gutenberg Literary -Archive Foundation and how your efforts and donations can help, see Sections 3 -and 4 and the Foundation information page at www.gutenberg.org - -Section 3. Information about the Project Gutenberg Literary Archive Foundation -The Project Gutenberg Literary Archive Foundation is a non profit 501(c)(3) -educational corporation organized under the laws of the state of Mississippi -and granted tax exempt status by the Internal Revenue Service. The Foundation's -EIN or federal tax identification number is 64-6221541. Contributions to the -Project Gutenberg Literary Archive Foundation are tax deductible to the full -extent permitted by U.S. federal laws and your state's laws. - -The Foundation's principal office is located at 4557 Melan Dr. S. Fairbanks, -AK, 99712., but its volunteers and employees are scattered throughout numerous -locations. Its business office is located at 809 North 1500 West, Salt Lake -City, UT 84116, (801) 596-1887. Email contact links and up to date contact -information can be found at the Foundation's web site and official page at -www.gutenberg.org/contact - -For additional contact information: - - Dr. Gregory B. Newby - Chief Executive and Director - [email protected] - -Section 4. Information about Donations to the Project Gutenberg Literary -Archive Foundation Project Gutenberg-tm depends upon and cannot survive without -wide spread public support and donations to carry out its mission of increasing -the number of public domain and licensed works that can be freely distributed -in machine readable form accessible by the widest array of equipment including -outdated equipment. Many small donations ($1 to $5,000) are particularly -important to maintaining tax exempt status with the IRS. - -The Foundation is committed to complying with the laws regulating charities and -charitable donations in all 50 states of the United States. Compliance -requirements are not uniform and it takes a considerable effort, much paperwork -and many fees to meet and keep up with these requirements. We do not solicit -donations in locations where we have not received written confirmation of -compliance. To SEND DONATIONS or determine the status of compliance for any -particular state visit www.gutenberg.org/donate - -While we cannot and do not solicit contributions from states where we have not -met the solicitation requirements, we know of no prohibition against accepting -unsolicited donations from donors in such states who approach us with offers to -donate. - -International donations are gratefully accepted, but we cannot make any -statements concerning tax treatment of donations received from outside the -United States. U.S. laws alone swamp our small staff. - -Please check the Project Gutenberg Web pages for current donation methods and -addresses. Donations are accepted in a number of other ways including checks, -online payments and credit card donations. To donate, please visit: -www.gutenberg.org/donate - -Section 5. General Information About Project Gutenberg-tm electronic works. -Professor Michael S. Hart was the originator of the Project Gutenberg-tm -concept of a library of electronic works that could be freely shared with -anyone. For forty years, he produced and distributed Project Gutenberg-tm -eBooks with only a loose network of volunteer support. - -Project Gutenberg-tm eBooks are often created from several printed editions, -all of which are confirmed as Public Domain in the U.S. unless a copyright -notice is included. Thus, we do not necessarily keep eBooks in compliance with -any particular paper edition. - -Most people start at our Web site which has the main PG search facility: -www.gutenberg.org - -This Web site includes information about Project Gutenberg-tm, including how to -make donations to the Project Gutenberg Literary Archive Foundation, how to -help produce our new eBooks, and how to subscribe to our email newsletter to -hear about new eBooks. - -[*] This paragraph, after 1.C., is included only for copyrighted works. For -those, you must contact the copyright holder before any non-free use or removal -of the Project Gutenberg header. - ================================================================================ The binary distribution for Apache Crunch includes the following http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/CleanTextIT.java ---------------------------------------------------------------------- diff --git a/crunch-core/src/it/java/org/apache/crunch/CleanTextIT.java b/crunch-core/src/it/java/org/apache/crunch/CleanTextIT.java index 9d6f682..563af07 100644 --- a/crunch-core/src/it/java/org/apache/crunch/CleanTextIT.java +++ b/crunch-core/src/it/java/org/apache/crunch/CleanTextIT.java @@ -41,7 +41,7 @@ import com.google.common.io.Files; */ public class CleanTextIT { - private static final int LINES_IN_SHAKES = 3667; + private static final int LINES_IN_SHAKES = 3285; @Rule public TemporaryPath tmpDir = TemporaryPaths.create(); http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/CollectionPObjectIT.java ---------------------------------------------------------------------- diff --git a/crunch-core/src/it/java/org/apache/crunch/CollectionPObjectIT.java b/crunch-core/src/it/java/org/apache/crunch/CollectionPObjectIT.java index 7e0c75c..08e5ac2 100644 --- a/crunch-core/src/it/java/org/apache/crunch/CollectionPObjectIT.java +++ b/crunch-core/src/it/java/org/apache/crunch/CollectionPObjectIT.java @@ -37,10 +37,10 @@ import org.junit.Test; @SuppressWarnings("serial") public class CollectionPObjectIT { - private static final int LINES_IN_SHAKES = 3667; + private static final int LINES_IN_SHAKES = 3285; private static final String FIRST_SHAKESPEARE_LINE = - "***The Project Gutenberg's Etext of Shakespeare's First Folio***"; + "The Tragedie of Macbeth"; private static final String LAST_SHAKESPEARE_LINE = "FINIS. THE TRAGEDIE OF MACBETH."; http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/CollectionsLengthIT.java ---------------------------------------------------------------------- diff --git a/crunch-core/src/it/java/org/apache/crunch/CollectionsLengthIT.java b/crunch-core/src/it/java/org/apache/crunch/CollectionsLengthIT.java index f1a33a2..f676bab 100644 --- a/crunch-core/src/it/java/org/apache/crunch/CollectionsLengthIT.java +++ b/crunch-core/src/it/java/org/apache/crunch/CollectionsLengthIT.java @@ -34,7 +34,7 @@ import org.junit.Test; @SuppressWarnings("serial") public class CollectionsLengthIT { - public static final Long LINES_IN_SHAKESPEARE = 3667L; + public static final Long LINES_IN_SHAKESPEARE = 3285L; @Rule public TemporaryPath tmpDir = TemporaryPaths.create(); @@ -64,6 +64,6 @@ public class CollectionsLengthIT { PCollection<String> shakespeare = pipeline.readTextFile(shakesInputPath); Long length = shakespeare.length().getValue(); - assertEquals("Incorrect length for shakespear PCollection.", LINES_IN_SHAKESPEARE, length); + assertEquals("Incorrect length for Shakespeare PCollection.", LINES_IN_SHAKESPEARE, length); } } http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/DeepCopyCustomTuplesIT.java ---------------------------------------------------------------------- diff --git a/crunch-core/src/it/java/org/apache/crunch/DeepCopyCustomTuplesIT.java b/crunch-core/src/it/java/org/apache/crunch/DeepCopyCustomTuplesIT.java index f1323ca..54f9917 100644 --- a/crunch-core/src/it/java/org/apache/crunch/DeepCopyCustomTuplesIT.java +++ b/crunch-core/src/it/java/org/apache/crunch/DeepCopyCustomTuplesIT.java @@ -54,7 +54,7 @@ public class DeepCopyCustomTuplesIT { .groupByKey() .parallelDo(new PostProcFn(), strings()) .materialize(); - assertEquals(65, Iterables.size(out)); + assertEquals(59, Iterables.size(out)); p.done(); } http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/FirstElementPObjectIT.java ---------------------------------------------------------------------- diff --git a/crunch-core/src/it/java/org/apache/crunch/FirstElementPObjectIT.java b/crunch-core/src/it/java/org/apache/crunch/FirstElementPObjectIT.java index d985e10..a016c12 100644 --- a/crunch-core/src/it/java/org/apache/crunch/FirstElementPObjectIT.java +++ b/crunch-core/src/it/java/org/apache/crunch/FirstElementPObjectIT.java @@ -36,7 +36,7 @@ import org.junit.Test; public class FirstElementPObjectIT { private static final String FIRST_SHAKESPEARE_LINE = - "***The Project Gutenberg's Etext of Shakespeare's First Folio***"; + "The Tragedie of Macbeth"; @Rule public TemporaryPath tmpDir = TemporaryPaths.create(); http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/PObjectsIT.java ---------------------------------------------------------------------- diff --git a/crunch-core/src/it/java/org/apache/crunch/PObjectsIT.java b/crunch-core/src/it/java/org/apache/crunch/PObjectsIT.java index 6ee849f..42c046a 100644 --- a/crunch-core/src/it/java/org/apache/crunch/PObjectsIT.java +++ b/crunch-core/src/it/java/org/apache/crunch/PObjectsIT.java @@ -37,7 +37,7 @@ import org.junit.Test; @SuppressWarnings("serial") public class PObjectsIT { - private static final Integer LINES_IN_SHAKES = 3667; + private static final Integer LINES_IN_SHAKES = 3285; @Rule public TemporaryPath tmpDir = TemporaryPaths.create(); http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/PipelineCallableIT.java ---------------------------------------------------------------------- diff --git a/crunch-core/src/it/java/org/apache/crunch/PipelineCallableIT.java b/crunch-core/src/it/java/org/apache/crunch/PipelineCallableIT.java index 95638a1..ff5dc60 100644 --- a/crunch-core/src/it/java/org/apache/crunch/PipelineCallableIT.java +++ b/crunch-core/src/it/java/org/apache/crunch/PipelineCallableIT.java @@ -95,7 +95,7 @@ public class PipelineCallableIT { assertFalse(p.run().succeeded()); } else { Map<String, Long> counts = top3.materializeToMap(); - assertEquals(ImmutableMap.of("", 788L, "Enter Macbeth.", 7L, "Exeunt.", 21L), counts); + assertEquals(ImmutableMap.of("", 697L, "Enter.", 7L, "Exeunt.", 21L), counts); assertEquals(17, INC1); assertEquals(29, INC2); } http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/RecordDropIT.java ---------------------------------------------------------------------- diff --git a/crunch-core/src/it/java/org/apache/crunch/RecordDropIT.java b/crunch-core/src/it/java/org/apache/crunch/RecordDropIT.java index 8c4c57f..3a82a19 100644 --- a/crunch-core/src/it/java/org/apache/crunch/RecordDropIT.java +++ b/crunch-core/src/it/java/org/apache/crunch/RecordDropIT.java @@ -54,7 +54,7 @@ public class RecordDropIT { } int index = 0; for (Iterable<Integer> iter : values) { - assertEquals("Checking index = " + index, 3667, Iterables.getFirst(iter, 0).intValue()); + assertEquals("Checking index = " + index, 3285, Iterables.getFirst(iter, 0).intValue()); index++; } p.done(); http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/StageResultsCountersIT.java ---------------------------------------------------------------------- diff --git a/crunch-core/src/it/java/org/apache/crunch/StageResultsCountersIT.java b/crunch-core/src/it/java/org/apache/crunch/StageResultsCountersIT.java index e74c166..45f3afd 100644 --- a/crunch-core/src/it/java/org/apache/crunch/StageResultsCountersIT.java +++ b/crunch-core/src/it/java/org/apache/crunch/StageResultsCountersIT.java @@ -89,7 +89,7 @@ public class StageResultsCountersIT { Map<String, Long> keywordsMap = countersToMap(result.getStageResults(), KEYWORDS_COUNTER_GROUP); - assertThat(keywordsMap, is((Map<String, Long>) ImmutableMap.of("NOT", 157L, "AND", 596L, "OR", 81L))); + assertThat(keywordsMap, is((Map<String, Long>) ImmutableMap.of("NOT", 145L, "AND", 544L, "OR", 37L))); } private static PipelineResult coutSpecialKeywords(Pipeline pipeline, String inputFileName, PTypeFamily tf) { http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/WordCountIT.java ---------------------------------------------------------------------- diff --git a/crunch-core/src/it/java/org/apache/crunch/WordCountIT.java b/crunch-core/src/it/java/org/apache/crunch/WordCountIT.java index e0bd719..257c917 100644 --- a/crunch-core/src/it/java/org/apache/crunch/WordCountIT.java +++ b/crunch-core/src/it/java/org/apache/crunch/WordCountIT.java @@ -150,8 +150,8 @@ public class WordCountIT { PTable<String, Long> wordCount = wordCount(shakespeare, tf); List<Pair<String, Long>> top5 = Lists.newArrayList(Aggregate.top(wordCount, 5, true).materialize()); assertEquals( - ImmutableList.of(Pair.of("", 1470L), Pair.of("the", 620L), Pair.of("and", 427L), Pair.of("of", 396L), - Pair.of("to", 367L)), top5); + ImmutableList.of(Pair.of("", 1345L), Pair.of("the", 528L), Pair.of("and", 375L), Pair.of("I", 314L), + Pair.of("of", 314L)), top5); } public void run(Pipeline pipeline, PTypeFamily typeFamily) throws IOException { @@ -191,14 +191,14 @@ public class WordCountIT { assertEquals(2, stageResults.size()); } else { assertEquals(1, stageResults.size()); - assertEquals(427, stageResults.get(0).getCounterValue(WordCountStats.ANDS)); + assertEquals(375, stageResults.get(0).getCounterValue(WordCountStats.ANDS)); } File outputFile = new File(outputPath, "part-r-00000"); List<String> lines = Files.readLines(outputFile, Charset.defaultCharset()); boolean passed = false; for (String line : lines) { - if (line.startsWith("Macbeth\t28") || line.startsWith("[Macbeth,28]")) { + if (line.startsWith("Macbeth\t") || line.startsWith("[Macbeth,")) { passed = true; break; } http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/impl/mr/plan/DotfilesIT.java ---------------------------------------------------------------------- diff --git a/crunch-core/src/it/java/org/apache/crunch/impl/mr/plan/DotfilesIT.java b/crunch-core/src/it/java/org/apache/crunch/impl/mr/plan/DotfilesIT.java index 98ae8d1..c33348a 100644 --- a/crunch-core/src/it/java/org/apache/crunch/impl/mr/plan/DotfilesIT.java +++ b/crunch-core/src/it/java/org/apache/crunch/impl/mr/plan/DotfilesIT.java @@ -159,13 +159,13 @@ public class DotfilesIT { List<PipelineResult.StageResult> stageResults = res.getStageResults(); assertEquals(1, stageResults.size()); - assertEquals(427, stageResults.get(0).getCounterValue(WordCountStats.ANDS)); + assertEquals(375, stageResults.get(0).getCounterValue(WordCountStats.ANDS)); File outputFile = new File(outputPath, "part-r-00000"); List<String> lines = Files.readLines(outputFile, Charset.defaultCharset()); boolean passed = false; for (String line : lines) { - if (line.startsWith("Macbeth\t28") || line.startsWith("[Macbeth,28]")) { + if (line.startsWith("Macbeth\t") || line.startsWith("[Macbeth,")) { passed = true; break; } http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/lib/MapredIT.java ---------------------------------------------------------------------- diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/MapredIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/MapredIT.java index 7c09790..6feff1f 100644 --- a/crunch-core/src/it/java/org/apache/crunch/lib/MapredIT.java +++ b/crunch-core/src/it/java/org/apache/crunch/lib/MapredIT.java @@ -109,7 +109,7 @@ public class MapredIT extends CrunchTestSupport implements Serializable { PipelineResult res = p.done(); assertEquals(1, res.getStageResults().size()); StageResult sr = res.getStageResults().get(0); - assertEquals(3667, sr.getCounters().findCounter("written", "out").getValue()); + assertEquals(3285, sr.getCounters().findCounter("written", "out").getValue()); } @Test @@ -129,6 +129,6 @@ public class MapredIT extends CrunchTestSupport implements Serializable { PipelineResult res = p.done(); assertEquals(1, res.getStageResults().size()); StageResult sr = res.getStageResults().get(0); - assertEquals(108, sr.getCounters().findCounter("thou", "count").getValue()); + assertEquals(103, sr.getCounters().findCounter("thou", "count").getValue()); } } http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/lib/MapreduceIT.java ---------------------------------------------------------------------- diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/MapreduceIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/MapreduceIT.java index ab453e0..9510457 100644 --- a/crunch-core/src/it/java/org/apache/crunch/lib/MapreduceIT.java +++ b/crunch-core/src/it/java/org/apache/crunch/lib/MapreduceIT.java @@ -95,7 +95,7 @@ public class MapreduceIT extends CrunchTestSupport implements Serializable { PipelineResult res = p.done(); assertEquals(1, res.getStageResults().size()); StageResult sr = res.getStageResults().get(0); - assertEquals(3667, sr.getCounters().findCounter("written", "out").getValue()); + assertEquals(3285, sr.getCounters().findCounter("written", "out").getValue()); } @Test http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractFullOuterJoinIT.java ---------------------------------------------------------------------- diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractFullOuterJoinIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractFullOuterJoinIT.java index 24e67b5..77edd8b 100644 --- a/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractFullOuterJoinIT.java +++ b/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractFullOuterJoinIT.java @@ -28,13 +28,13 @@ public abstract class AbstractFullOuterJoinIT extends JoinTester { boolean passed2 = false; boolean passed3 = false; for (Pair<String, Long> line : lines) { - if ("wretched".equals(line.first()) && 24 == line.second()) { + if ("wretched".equals(line.first()) && 19 == line.second()) { passed1 = true; } if ("againe".equals(line.first()) && 10 == line.second()) { passed2 = true; } - if ("Montparnasse.".equals(line.first()) && 2 == line.second()) { + if ("moon".equals(line.first()) && 9 == line.second()) { passed3 = true; } } http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractInnerJoinIT.java ---------------------------------------------------------------------- diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractInnerJoinIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractInnerJoinIT.java index 8ceaa03..a13ff27 100644 --- a/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractInnerJoinIT.java +++ b/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractInnerJoinIT.java @@ -28,13 +28,13 @@ public abstract class AbstractInnerJoinIT extends JoinTester { boolean passed2 = true; boolean passed3 = true; for (Pair<String, Long> line : lines) { - if ("wretched".equals(line.first()) && 24 == line.second()) { + if ("wretched".equals(line.first()) && 19 == line.second()) { passed1 = true; } if ("againe".equals(line.first())) { passed2 = false; } - if ("Montparnasse.".equals(line.first())) { + if ("moon".equals(line.first())) { passed3 = false; } } http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractLeftOuterJoinIT.java ---------------------------------------------------------------------- diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractLeftOuterJoinIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractLeftOuterJoinIT.java index 241f5ad..43b4118 100644 --- a/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractLeftOuterJoinIT.java +++ b/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractLeftOuterJoinIT.java @@ -28,13 +28,13 @@ public abstract class AbstractLeftOuterJoinIT extends JoinTester { boolean passed2 = false; boolean passed3 = true; for (Pair<String, Long> line : lines) { - if ("wretched".equals(line.first()) && 24 == line.second()) { + if ("wretched".equals(line.first()) && 19 == line.second()) { passed1 = true; } if ("againe".equals(line.first()) && 10 == line.second()) { passed2 = true; } - if ("Montparnasse.".equals(line.first())) { + if ("moon".equals(line.first())) { passed3 = false; } } http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractRightOuterJoinIT.java ---------------------------------------------------------------------- diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractRightOuterJoinIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractRightOuterJoinIT.java index 43e0479..e5e7b4e 100644 --- a/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractRightOuterJoinIT.java +++ b/crunch-core/src/it/java/org/apache/crunch/lib/join/AbstractRightOuterJoinIT.java @@ -28,13 +28,13 @@ public abstract class AbstractRightOuterJoinIT extends JoinTester { boolean passed2 = true; boolean passed3 = false; for (Pair<String, Long> line : lines) { - if ("wretched".equals(line.first()) && 24 == line.second()) { + if ("wretched".equals(line.first()) && 19 == line.second()) { passed1 = true; } if ("againe".equals(line.first())) { passed2 = false; } - if ("Montparnasse.".equals(line.first()) && 2 == line.second()) { + if ("moon".equals(line.first()) && 9 == line.second()) { passed3 = true; } } http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-core/src/it/java/org/apache/crunch/lib/join/JoinTester.java ---------------------------------------------------------------------- diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/join/JoinTester.java b/crunch-core/src/it/java/org/apache/crunch/lib/join/JoinTester.java index 700cba5..3ada7e0 100644 --- a/crunch-core/src/it/java/org/apache/crunch/lib/join/JoinTester.java +++ b/crunch-core/src/it/java/org/apache/crunch/lib/join/JoinTester.java @@ -72,11 +72,11 @@ public abstract class JoinTester implements Serializable { protected void run(Pipeline pipeline, PTypeFamily typeFamily) throws IOException { String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt"); - String maughamInputPath = tmpDir.copyResourceFileName("maugham.txt"); + String dickensInputPath = tmpDir.copyResourceFileName("dickens.txt"); PCollection<String> shakespeare = pipeline.readTextFile(shakesInputPath); - PCollection<String> maugham = pipeline.readTextFile(maughamInputPath); - PTable<String, Long> joined = join(shakespeare, maugham, typeFamily); + PCollection<String> dickens = pipeline.readTextFile(dickensInputPath); + PTable<String, Long> joined = join(shakespeare, dickens, typeFamily); Iterable<Pair<String, Long>> lines = joined.materialize(); assertPassed(lines); http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-hbase/src/it/java/org/apache/crunch/io/hbase/HFileTargetIT.java ---------------------------------------------------------------------- diff --git a/crunch-hbase/src/it/java/org/apache/crunch/io/hbase/HFileTargetIT.java b/crunch-hbase/src/it/java/org/apache/crunch/io/hbase/HFileTargetIT.java index af24865..9027c1b 100644 --- a/crunch-hbase/src/it/java/org/apache/crunch/io/hbase/HFileTargetIT.java +++ b/crunch-hbase/src/it/java/org/apache/crunch/io/hbase/HFileTargetIT.java @@ -196,7 +196,7 @@ public class HFileTargetIT implements Serializable { FileSystem fs = FileSystem.get(HBASE_TEST_UTILITY.getConfiguration()); KeyValue kv = readFromHFiles(fs, outputPath, "and"); - assertEquals(427L, Bytes.toLong(kv.getValue())); + assertEquals(375L, Bytes.toLong(kv.getValue())); } @Test @@ -223,11 +223,11 @@ public class HFileTargetIT implements Serializable { .doBulkLoad(outputPath, testTable); Map<String, Long> EXPECTED = ImmutableMap.<String, Long>builder() - .put("__EMPTY__", 1470L) - .put("the", 620L) - .put("and", 427L) - .put("of", 396L) - .put("to", 367L) + .put("__EMPTY__", 1345L) + .put("the", 528L) + .put("and", 375L) + .put("I", 314L) + .put("of", 314L) .build(); for (Map.Entry<String, Long> e : EXPECTED.entrySet()) { @@ -270,8 +270,8 @@ public class HFileTargetIT implements Serializable { loader.doBulkLoad(outputPath1, table1); loader.doBulkLoad(outputPath2, table2); - assertEquals(396L, getWordCountFromTable(table1, "of")); - assertEquals(427L, getWordCountFromTable(table2, "and")); + assertEquals(314L, getWordCountFromTable(table1, "of")); + assertEquals(375L, getWordCountFromTable(table2, "and")); } @Test http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/AggregatorsIntegrationTest.scala ---------------------------------------------------------------------- diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/AggregatorsIntegrationTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/AggregatorsIntegrationTest.scala index 94a6e12..c79783d 100644 --- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/AggregatorsIntegrationTest.scala +++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/AggregatorsIntegrationTest.scala @@ -34,7 +34,7 @@ class AggregatorsIntegrationTest extends CrunchSuite { .groupByKey .combineValues(Aggregators.product[(Long, Int)](Aggregators.sum[Long], Aggregators.max[Int])) .materialize - assert(fcc.exists(_ == ("w", (1404, 12)))) + assert(fcc.exists(_ == ("w", (1302, 12)))) pipeline.done } http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/CogroupTest.scala ---------------------------------------------------------------------- diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/CogroupTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/CogroupTest.scala index c7e53ae..fb994ca 100644 --- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/CogroupTest.scala +++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/CogroupTest.scala @@ -31,10 +31,10 @@ class CogroupTest extends CrunchSuite { @Test def cogroup { val shakespeare = tempDir.copyResourceFileName("shakes.txt") - val maugham = tempDir.copyResourceFileName("maugham.txt") - val diffs = wordCount(shakespeare).cogroup(wordCount(maugham)) + val dickens = tempDir.copyResourceFileName("dickens.txt") + val diffs = wordCount(shakespeare).cogroup(wordCount(dickens)) .map((k, v) => (k, (v._1.sum - v._2.sum))).materialize - assert(diffs.exists(_ == ("the", -11390))) + assert(diffs.exists(_ == ("the", -11043))) pipeline.done } } http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/IncrementTest.scala ---------------------------------------------------------------------- diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/IncrementTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/IncrementTest.scala index 44aa9a8..d480d22 100644 --- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/IncrementTest.scala +++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/IncrementTest.scala @@ -46,9 +46,9 @@ class IncrementTest extends CrunchSuite { val res = pipeline.done() val sr0 = res.getStageResults.get(0) - assertEquals(21836, sr0.getCounterValue("TOP", "ALLWORDS")) - assertEquals(20366, sr0.getCounterValue("TOP", "NONEMPTY")) - assertEquals(3604, sr0.getCounterValue("TOP", "AWORDS_2x")) - assertEquals(20366, sr0.getCounterValue("Inc", "A")) + assertEquals(19082, sr0.getCounterValue("TOP", "ALLWORDS")) + assertEquals(17737, sr0.getCounterValue("TOP", "NONEMPTY")) + assertEquals(3088, sr0.getCounterValue("TOP", "AWORDS_2x")) + assertEquals(17737, sr0.getCounterValue("Inc", "A")) } } http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/JoinTest.scala ---------------------------------------------------------------------- diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/JoinTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/JoinTest.scala index 35a6500..8947ce6 100644 --- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/JoinTest.scala +++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/JoinTest.scala @@ -34,25 +34,25 @@ class JoinTest extends CrunchSuite { @Test def join { val shakespeare = tempDir.copyResourceFileName("shakes.txt") - val maugham = tempDir.copyResourceFileName("maugham.txt") + val dickens = tempDir.copyResourceFileName("dickens.txt") val output = tempDir.getFile("output") - val filtered = wordCount(shakespeare).join(wordCount(maugham)) + val filtered = wordCount(shakespeare).join(wordCount(dickens)) .map((k, v) => (k, v._1 - v._2)) .write(to.textFile(output.getAbsolutePath())) .filter((k, d) => d > 0).materialize - assert(filtered.exists(_ == ("macbeth", 66))) + assert(filtered.exists(_ == ("noble", 9))) pipeline.done } @Test def joinMapside { val shakespeare = tempDir.copyResourceFileName("shakes.txt") - val maugham = tempDir.copyResourceFileName("maugham.txt") + val dickens = tempDir.copyResourceFileName("dickens.txt") val output = tempDir.getFile("output") - val filtered = wordCount(shakespeare).innerJoinUsing(wordCount(maugham), Joins.mapside()) + val filtered = wordCount(shakespeare).innerJoinUsing(wordCount(dickens), Joins.mapside()) .map((k, v) => (k, v._1 - v._2)) .write(to.textFile(output.getAbsolutePath())) .filter((k, d) => d > 0).materialize - assert(filtered.exists(_ == ("macbeth", 66))) + assert(filtered.exists(_ == ("noble", 9))) pipeline.done } http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala ---------------------------------------------------------------------- diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala index 3c232b1..b81165f 100644 --- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala +++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala @@ -30,11 +30,11 @@ import org.scalatest.junit.JUnitSuite class PCollectionTest extends CrunchSuite { // Number of lines in the Shakespeare data set. - val linesInShakespeare: Int = 3667 + val linesInShakespeare: Int = 3285 // The first line in the Shakespeare data set. val firstLineInShakespeare: String = - "***The Project Gutenberg's Etext of Shakespeare's First Folio***" + "The Tragedie of Macbeth" // The last line in the Shakespeare data set. val lastLineInShakespeare: String = @@ -79,6 +79,6 @@ class PCollectionTest extends CrunchSuite { // With a seed of 1L, 380 elements should be sampled. val sampledCollection = shakespeare.sample(0.10, 1L) val length = sampledCollection.length().value() - assertEquals("Incorrect number of elements sampled with seed 1L.", 380L, length) + assertEquals("Incorrect number of elements sampled with seed 1L.", 338L, length) } } http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PipelineAppTest.scala ---------------------------------------------------------------------- diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PipelineAppTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PipelineAppTest.scala index c566e59..c5a56fc 100644 --- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PipelineAppTest.scala +++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PipelineAppTest.scala @@ -40,7 +40,7 @@ class PipelineAppTest extends CrunchSuite { @Test def run { val args = new Array[String](3) args(0) = tempDir.copyResourceFileName("shakes.txt") - args(1) = tempDir.copyResourceFileName("maugham.txt") + args(1) = tempDir.copyResourceFileName("dickens.txt") args(2) = tempDir.getFileName("output") tempDir.overridePathProperties(WordCount.configuration) WordCount.main(args) http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/TopTest.scala ---------------------------------------------------------------------- diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/TopTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/TopTest.scala index 186ec27..416251b 100644 --- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/TopTest.scala +++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/TopTest.scala @@ -35,6 +35,6 @@ class TopTest extends CrunchSuite { val wc = pipeline.read(from.textFile(input)) .flatMap(_.toLowerCase.split("\\s+")) .filter(!_.isEmpty()).count - assert(wc.top(10, true).materialize.exists(_ == ("is", 205))) + assert(wc.top(10, true).materialize.exists(_ == ("is", 175))) } } http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/UnionTest.scala ---------------------------------------------------------------------- diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/UnionTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/UnionTest.scala index f62cef3..aebd2df 100644 --- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/UnionTest.scala +++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/UnionTest.scala @@ -30,21 +30,21 @@ class UnionTest extends CrunchSuite { @Test def testUnionCollection { val shakespeare = tempDir.copyResourceFileName("shakes.txt") - val maugham = tempDir.copyResourceFileName("maugham.txt") + val dickens = tempDir.copyResourceFileName("dickens.txt") val union = pipeline.read(from.textFile(shakespeare)).union( - pipeline.read(from.textFile(maugham))) + pipeline.read(from.textFile(dickens))) val wc = wordCount(union).materialize - assert(wc.exists(_ == ("you", 3691))) + assert(wc.exists(_ == ("you", 2552))) pipeline.done } @Test def testUnionTable { val shakespeare = tempDir.copyResourceFileName("shakes.txt") - val maugham = tempDir.copyResourceFileName("maugham.txt") + val dickens = tempDir.copyResourceFileName("dickens.txt") val wcs = wordCount(pipeline.read(from.textFile(shakespeare))) - val wcm = wordCount(pipeline.read(from.textFile(maugham))) + val wcm = wordCount(pipeline.read(from.textFile(dickens))) val wc = wcs.union(wcm).groupByKey.combine(v => v.sum).materialize - assert(wc.exists(_ == ("you", 3691))) + assert(wc.exists(_ == ("you", 2552))) pipeline.done } } http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/WordCountTest.scala ---------------------------------------------------------------------- diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/WordCountTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/WordCountTest.scala index 7ee4de0..bac56f9 100644 --- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/WordCountTest.scala +++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/WordCountTest.scala @@ -33,7 +33,7 @@ class WordCountTest extends CrunchSuite { .write(to.textFile(wordCountOut)) // Word counts .map((w, c) => (w.slice(0, 1), c)) .groupByKey.combine(v => v.sum).materialize - assert(fcc.exists(_ == ("w", 1404))) + assert(fcc.exists(_ == ("w", 1302))) pipeline.done } http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-spark/src/it/java/org/apache/crunch/SparkHFileTargetIT.java ---------------------------------------------------------------------- diff --git a/crunch-spark/src/it/java/org/apache/crunch/SparkHFileTargetIT.java b/crunch-spark/src/it/java/org/apache/crunch/SparkHFileTargetIT.java index 8126e81..815aaff 100644 --- a/crunch-spark/src/it/java/org/apache/crunch/SparkHFileTargetIT.java +++ b/crunch-spark/src/it/java/org/apache/crunch/SparkHFileTargetIT.java @@ -170,7 +170,7 @@ public class SparkHFileTargetIT implements Serializable { FileSystem fs = FileSystem.get(HBASE_TEST_UTILITY.getConfiguration()); KeyValue kv = readFromHFiles(fs, outputPath, "and"); - assertEquals(427L, Bytes.toLong(kv.getValue())); + assertEquals(375L, Bytes.toLong(kv.getValue())); pipeline.done(); } @@ -199,11 +199,11 @@ public class SparkHFileTargetIT implements Serializable { .doBulkLoad(outputPath, testTable); Map<String, Long> EXPECTED = ImmutableMap.<String, Long>builder() - .put("__EMPTY__", 1470L) - .put("the", 620L) - .put("and", 427L) - .put("of", 396L) - .put("to", 367L) + .put("__EMPTY__", 1345L) + .put("the", 528L) + .put("and", 375L) + .put("I", 314L) + .put("of", 314L) .build(); for (Map.Entry<String, Long> e : EXPECTED.entrySet()) { @@ -246,8 +246,8 @@ public class SparkHFileTargetIT implements Serializable { loader.doBulkLoad(outputPath1, table1); loader.doBulkLoad(outputPath2, table2); - assertEquals(396L, getWordCountFromTable(table1, "of")); - assertEquals(427L, getWordCountFromTable(table2, "and")); + assertEquals(314L, getWordCountFromTable(table1, "of")); + assertEquals(375L, getWordCountFromTable(table2, "and")); pipeline.done(); } http://git-wip-us.apache.org/repos/asf/crunch/blob/5d237b36/crunch-spark/src/it/java/org/apache/crunch/SparkPipelineCallableIT.java ---------------------------------------------------------------------- diff --git a/crunch-spark/src/it/java/org/apache/crunch/SparkPipelineCallableIT.java b/crunch-spark/src/it/java/org/apache/crunch/SparkPipelineCallableIT.java index d799842..de0f893 100644 --- a/crunch-spark/src/it/java/org/apache/crunch/SparkPipelineCallableIT.java +++ b/crunch-spark/src/it/java/org/apache/crunch/SparkPipelineCallableIT.java @@ -90,7 +90,7 @@ public class SparkPipelineCallableIT extends CrunchTestSupport { assertFalse(p.run().succeeded()); } else { Map<String, Long> counts = top3.materializeToMap(); - assertEquals(ImmutableMap.of("", 788L, "Enter Macbeth.", 7L, "Exeunt.", 21L), counts); + assertEquals(ImmutableMap.of("", 697L, "Enter.", 7L, "Exeunt.", 21L), counts); assertEquals(17, INC1); assertEquals(29, INC2); }
