[ https://issues.apache.org/jira/browse/MAHOUT-235?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12795704#action_12795704 ]
Ted Dunning edited comment on MAHOUT-235 at 12/31/09 11:24 PM: --------------------------------------------------------------- I just looked at this. Didn't find the problem, but did make some comments in the code and clarified it a bit. My guess is that the main partition loop leaves b and c in the wrong order. If i get another chance to look at this, I will put in some assertions that match what I think the loop invariants should be. That will probably make things obvious. Here is my commented version fo the key sort routine: {noformat} private static void quickSort0(int start, int end, IntComparator comp, Swapper swap) { int length = end - start; if (length < 7) { insertionSort(start, end, comp, swap); return; } int middle = (start + end) / 2; if (length > 7) { int bottom = start; int top = end - 1; if (length > 40) { // for lots of data, bottom, middle and top are medians near the beginning, middle or end of the data int skosh = length / 8; bottom = med3(bottom, bottom + skosh, bottom + (2 * skosh), comp); middle = med3(middle - skosh, middle, middle + skosh, comp); top = med3(top - (2 * skosh), top - skosh, top, comp); } middle = med3(bottom, middle, top, comp); } int partitionIndex = middle; // an index, not a value. // regions from a to b and from c to d are what we will recursively sort int a, b, c, d; a = b = start; c = d = end - 1; while (b <= c) { // copy all values equal to the partition value to before a..b. In the process, advance b // as long as values less than the partition or equal are found, also stop when a..b collides with c..d int comparison = comp.compare(b, partitionIndex); while (b <= c && comparison <= 0) { if (comparison == 0) { swap.swap(a, b); a++; } b++; comparison = comp.compare(b, partitionIndex); } // at this point [start..a) has partition values, [a..b) has values < partition // also, either b>c or v[b] > partition value comparison = comp.compare(c, partitionIndex); while (c >= b && comparison >= 0) { if (comparison == 0) { swap.swap(c, d); d--; } c--; comparison = comp.compare(c, partitionIndex); } // now we also know that [d..end] contains partition values, // [c..d) contains values > partition value // also, either b>c or (v[b] > partition OR v[c] < partition) if (b <= c) { // v[b] > partition OR v[c] < partition // swapping will let us continue to grow the two regions swap.swap(b, c); b++; c--; } } // now we know // b = c+1 // [start..a) and [d..end) contain partition value // all of [a..b) are less than partition // all of [c..d) are greater than partition // shift [a..b) to beginning length = Math.min(a - start, b - a); int l = start; int h = b - length; while (length-- > 0) { swap.swap(l, h); l++; h++; } // shift [c..d) to end length = Math.min(d - c, end - 1 - d); l = b; h = end - length; while (length-- > 0) { swap.swap(l, h); l++; h++; } // recurse left and right length = b - a; if (length > 0) { quickSort0(start, start + length, comp, swap); } length = d - c; if (length > 0) { quickSort0(end - length, end, comp, swap); } } /** * In-place insertion sort that is fast for pre-sorted data. * * @param start Where to start sorting (inclusive) * @param end Where to stop (exclusive) * @param comp Sort order. * @param swap How to swap items. */ private static void insertionSort(int start, int end, IntComparator comp, Swapper swap) { for (int i = start + 1; i < end; i++) { for (int j = i; j > start && comp.compare(j - 1, j) > 0; j--) { swap.swap(j - 1, j); } } } {noformat} was (Author: tdunning): I just looked at this. Didn't find the problem, but did make some comments in the code and clarified it a bit. My guess is that the main partition loop leaves b and c in the wrong order. If i get another chance to look at this, I will put in some assertions that match what I think the loop invariants should be. That will probably make things obvious. Here is my commented version fo the key sort routine: {unformatted} private static void quickSort0(int start, int end, IntComparator comp, Swapper swap) { int length = end - start; if (length < 7) { insertionSort(start, end, comp, swap); return; } int middle = (start + end) / 2; if (length > 7) { int bottom = start; int top = end - 1; if (length > 40) { // for lots of data, bottom, middle and top are medians near the beginning, middle or end of the data int skosh = length / 8; bottom = med3(bottom, bottom + skosh, bottom + (2 * skosh), comp); middle = med3(middle - skosh, middle, middle + skosh, comp); top = med3(top - (2 * skosh), top - skosh, top, comp); } middle = med3(bottom, middle, top, comp); } int partitionIndex = middle; // an index, not a value. // regions from a to b and from c to d are what we will recursively sort int a, b, c, d; a = b = start; c = d = end - 1; while (b <= c) { // copy all values equal to the partition value to before a..b. In the process, advance b // as long as values less than the partition or equal are found, also stop when a..b collides with c..d int comparison = comp.compare(b, partitionIndex); while (b <= c && comparison <= 0) { if (comparison == 0) { swap.swap(a, b); a++; } b++; comparison = comp.compare(b, partitionIndex); } // at this point [start..a) has partition values, [a..b) has values < partition // also, either b>c or v[b] > partition value comparison = comp.compare(c, partitionIndex); while (c >= b && comparison >= 0) { if (comparison == 0) { swap.swap(c, d); d--; } c--; comparison = comp.compare(c, partitionIndex); } // now we also know that [d..end] contains partition values, // [c..d) contains values > partition value // also, either b>c or (v[b] > partition OR v[c] < partition) if (b <= c) { // v[b] > partition OR v[c] < partition // swapping will let us continue to grow the two regions swap.swap(b, c); b++; c--; } } // now we know // b = c+1 // [start..a) and [d..end) contain partition value // all of [a..b) are less than partition // all of [c..d) are greater than partition // shift [a..b) to beginning length = Math.min(a - start, b - a); int l = start; int h = b - length; while (length-- > 0) { swap.swap(l, h); l++; h++; } // shift [c..d) to end length = Math.min(d - c, end - 1 - d); l = b; h = end - length; while (length-- > 0) { swap.swap(l, h); l++; h++; } // recurse left and right length = b - a; if (length > 0) { quickSort0(start, start + length, comp, swap); } length = d - c; if (length > 0) { quickSort0(end - length, end, comp, swap); } } /** * In-place insertion sort that is fast for pre-sorted data. * * @param start Where to start sorting (inclusive) * @param end Where to stop (exclusive) * @param comp Sort order. * @param swap How to swap items. */ private static void insertionSort(int start, int end, IntComparator comp, Swapper swap) { for (int i = start + 1; i < end; i++) { for (int j = i; j > start && comp.compare(j - 1, j) > 0; j--) { swap.swap(j - 1, j); } } } {unformatted} > GenericSorting.java also needs replacing > ---------------------------------------- > > Key: MAHOUT-235 > URL: https://issues.apache.org/jira/browse/MAHOUT-235 > Project: Mahout > Issue Type: Bug > Components: Math > Affects Versions: 0.3 > Reporter: Benson Margulies > Attachments: oops.diff > > > It looks like GenericSorting.java is more code of the same dubious parentage > that needs the same treatment. -- This message is automatically generated by JIRA. - You can reply to this email to add a comment to the issue online.