[
https://issues.apache.org/jira/browse/TAJO-966?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14084313#comment-14084313
]
ASF GitHub Bot commented on TAJO-966:
-------------------------------------
Github user jihoonson commented on a diff in the pull request:
https://github.com/apache/tajo/pull/91#discussion_r15740995
--- Diff:
tajo-core/src/main/java/org/apache/tajo/engine/planner/UniformRangePartition.java
---
@@ -94,25 +100,81 @@ public UniformRangePartition(TupleRange range,
SortSpec [] sortSpecs) {
}
List<TupleRange> ranges = Lists.newArrayList();
- BigDecimal term = reverseCardsForDigit[0].divide(
- new BigDecimal(partNum), RoundingMode.CEILING);
- BigDecimal reminder = reverseCardsForDigit[0];
- Tuple last = range.getStart();
- while(reminder.compareTo(new BigDecimal(0)) > 0) {
+
+ BigDecimal x = new BigDecimal(reverseCardsForDigit[0]);
+
+ BigInteger term = x.divide(BigDecimal.valueOf(partNum),
RoundingMode.CEILING).toBigInteger();
+ BigInteger reminder = reverseCardsForDigit[0];
+ Tuple last = mergedRange.getStart();
+ TupleRange tupleRange;
+ while(reminder.compareTo(BigInteger.ZERO) > 0) {
if (reminder.compareTo(term) <= 0) { // final one is inclusive
- ranges.add(new TupleRange(sortSpecs, last, range.getEnd()));
+ tupleRange = new TupleRange(sortSpecs, last, mergedRange.getEnd());
} else {
- Tuple next = increment(last, term.longValue(), variableId);
- ranges.add(new TupleRange(sortSpecs, last, next));
+ Tuple next = increment(last, term, variableId);
+ tupleRange = new TupleRange(sortSpecs, last, next);
}
+
+ ranges.add(tupleRange);
last = ranges.get(ranges.size() - 1).getEnd();
reminder = reminder.subtract(term);
}
+ for (TupleRange r : ranges) {
+ denormalize(sortSpecs, r);
+ }
+
return ranges.toArray(new TupleRange[ranges.size()]);
}
/**
+ * It normalizes the start and end keys to have the same length bytes if
they are texts or bytes.
+ *
+ * @param sortSpecs The sort specs
+ * @param range Tuple range to be normalize
+ */
+ public static void normalize(final SortSpec [] sortSpecs, TupleRange
range) {
+ // normalize text fields to have same bytes length
+ for (int i = 0; i < sortSpecs.length; i++) {
+ if (sortSpecs[i].getSortKey().getDataType().getType() ==
TajoDataTypes.Type.TEXT) {
+ byte [] startBytes;
+ byte [] endBytes;
+ if (range.getStart().isNull(i)) {
+ startBytes = BigInteger.ZERO.toByteArray();
+ } else {
+ startBytes = range.getStart().getBytes(i);
+ }
+
+ if (range.getEnd().isNull(i)) {
+ endBytes = BigInteger.ZERO.toByteArray();
+ } else {
+ endBytes = range.getEnd().getBytes(i);
+ }
+
+ byte [][] padded = BytesUtils.padBytes(startBytes, endBytes);
+ range.getStart().put(i, DatumFactory.createText(padded[0]));
+ range.getEnd().put(i, DatumFactory.createText(padded[1]));
+ }
+ }
+ }
+
+ /**
+ * Normalized keys have padding values, but it will cause the key
mismatch in pull server.
+ * So, it denormalize the normalized keys again.
+ *
+ * @param sortSpecs The sort specs
+ * @param range Tuple range to be denormalized
+ */
+ public static void denormalize(SortSpec [] sortSpecs, TupleRange range) {
+ for (int i = 0; i < sortSpecs.length; i++) {
+ if (sortSpecs[i].getSortKey().getDataType().getType() ==
TajoDataTypes.Type.TEXT) {
--- End diff --
As commented above, it seems the type could also be BLOB.
> Range partition should support split of multiple characters.
> ------------------------------------------------------------
>
> Key: TAJO-966
> URL: https://issues.apache.org/jira/browse/TAJO-966
> Project: Tajo
> Issue Type: Improvement
> Components: data shuffle
> Reporter: Hyunsik Choi
> Assignee: Hyunsik Choi
> Fix For: 0.9.0
>
>
> Currently, range partitioning does not support splitting on multiple characters. As
> a result, it only considers the first character when Tajo does range
> partitioning against TEXT or VARCHAR fields. This approach sometimes results
> in skewed ranges, which causes performance degradation.
> We should fix it.
--
This message was sent by Atlassian JIRA
(v6.2#6252)