Github user sryza commented on a diff in the pull request:
https://github.com/apache/spark/pull/1977#discussion_r20132241
--- Diff: python/pyspark/shuffle.py ---
@@ -520,6 +505,295 @@ def sorted(self, iterator, key=None, reverse=False):
return heapq.merge(chunks, key=key, reverse=reverse)
+class ExternalList(object):
+ """
+ ExternalList can have many items which cannot be hold in memory in
+ the same time.
+
+ >>> l = ExternalList(range(100))
+ >>> len(l)
+ 100
+ >>> l.append(10)
+ >>> len(l)
+ 101
+ >>> for i in range(10240):
+ ... l.append(i)
+ >>> len(l)
+ 10341
+ >>> import pickle
+ >>> l2 = pickle.loads(pickle.dumps(l))
+ >>> len(l2)
+ 10341
+ >>> list(l2)[100]
+ 10
+ """
+ LIMIT = 10240
+
+ def __init__(self, values):
+ self.values = values
+ self.disk_count = 0
+ self._file = None
+ self._ser = None
+
+ def __getstate__(self):
+ if self._file is not None:
+ self._file.flush()
+ f = os.fdopen(os.dup(self._file.fileno()))
+ f.seek(0)
+ bytes = f.read()
+ else:
+ bytes = ''
+ return self.values, self.disk_count, bytes
+
+ def __setstate__(self, item):
+ self.values, self.disk_count, bytes = item
+ if bytes:
+ self._open_file()
+ self._file.write(bytes)
+ else:
+ self._file = None
+ self._ser = None
+
+ def __iter__(self):
+ if self._file is not None:
+ self._file.flush()
+ # read all items from disks first
+ with os.fdopen(os.dup(self._file.fileno()), 'r') as f:
+ f.seek(0)
+ for values in self._ser.load_stream(f):
+ for v in values:
+ yield v
+
+ for v in self.values:
+ yield v
+
+ def __len__(self):
+ return self.disk_count + len(self.values)
+
+ def append(self, value):
+ self.values.append(value)
+ # dump them into disk if the key is huge
+ if len(self.values) >= self.LIMIT:
+ self._spill()
+
+ def _open_file(self):
+ dirs = _get_local_dirs("objects")
+ d = dirs[id(self) % len(dirs)]
+ if not os.path.exists(d):
+ os.makedirs(d)
+ p = os.path.join(d, str(id))
+ self._file = open(p, "w+", 65536)
+ self._ser = CompressedSerializer(PickleSerializer())
+ os.unlink(p)
+
+ def _spill(self):
+ """ dump the values into disk """
+ global MemoryBytesSpilled, DiskBytesSpilled
+ if self._file is None:
+ self._open_file()
+
+ used_memory = get_used_memory()
+ pos = self._file.tell()
+ self._ser.dump_stream([self.values], self._file)
+ self.disk_count += len(self.values)
+ self.values = []
+ gc.collect()
+ DiskBytesSpilled += self._file.tell() - pos
+ MemoryBytesSpilled += (used_memory - get_used_memory()) << 20
+
+
+class GroupByKey(object):
+ """
+ group a sorted iterator into [(k1, it1), (k2, it2), ...]
+
+ >>> k = [i/3 for i in range(6)]
+ >>> v = [i for i in range(6)]
+ >>> g = GroupByKey(iter(zip(k, v)))
+ >>> [(k, list(it)) for k, it in g]
+ [(0, [0, 1, 2]), (1, [3, 4, 5])]
+ """
+ def __init__(self, iterator):
+ self.iterator = iterator
+ self.next_item = None
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ key, value = self.next_item if self.next_item else next(self.iterator)
+ values = ExternalList([value])
+ try:
+ while True:
+ k, v = next(self.iterator)
+ if k != key:
+ self.next_item = (k, v)
+ break
+ values.append(v)
+ except StopIteration:
+ self.next_item = None
+ return key, values
+
+
+class ChainedIterable(object):
+ """
+ Picklable chained iterator, similar to itertools.chain.fromiterable()
+ """
+ def __init__(self, iterators):
+ self.iterators = iterators
+
+ def __len__(self):
+ return sum(len(vs) for vs in self.iterators)
+
+ def __iter__(self):
+ return itertools.chain.fromiterable(self.iterators)
+
+
+class ExternalGroupBy(ExternalMerger):
+
+ """
+ Group by the items by key. If any partition of them can not been
+ hold in memory, it will do sort based group by.
+
+ This class works as follows:
+
+ - It repeatedly group the items by key and save them in one dict in
+ memory.
+
+ - When the used memory goes above memory limit, it will split
+ the combined data into partitions by hash code, dump them
+ into disk, one file per partition. If the number of keys
+ in one partitions is smaller than 1000, it will sort them
+ by key before dumping into disk.
+
+ - Then it goes through the rest of the iterator, group items
+ by key into different dict by hash. Until the used memory goes over
+ memory limit, it dump all the dicts into disks, one file per
+ dict. Repeat this again until combine all the items. It
+ also will try to sort the items by key in each partition
+ before dumping into disks.
+
+ - It will yield the grouped items partitions by partitions.
+ If the data in one partitions can be hold in memory, then it
+ will load and combine them in memory and yield.
+
+ - If the dataset in one partittion cannot be hold in memory,
--- End diff --
nit: partition
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]