On May 22, 2009, at 12:31 PM, Andrea Gavana wrote: > Hi All, > > this should be a very easy question but I am trying to make a > script run as fast as possible, so please bear with me if the solution > is easy and I just overlooked it. > > I have a list of integers, like this one: > > indices = [1,2,3,4,5,6,7,8,9,255,256,257,258,10001,10002,10003,10004] > >> From this list, I would like to find out which values are consecutive > and store them in another list of tuples (begin_consecutive, > end_consecutive) or a simple list: as an example, the previous list > will become: > > new_list = [(1, 9), (255, 258), (10001, 10004)]
Josef's and Chris's solutions are pretty neat in this case. I've been recently working on a more generic case where integers are grouped depending on some condition (equals, differing by 1 or 2...). A version in pure Python/numpy, the `Cluster` class is available in scikits.hydroclimpy.core.tools (hydroclimpy.sourceforge.net). Otherwise, here's a Cython version of the same class. Let me know if it works. And I'm not ultra happy with the name, so if you have any suggestions... cdef class Brackets: """ Groups consecutive data from an array according to a clustering condition. A cluster is defined as a group of consecutive values differing by at most the increment value. Missing values are **not** handled: the input sequence must therefore be free of missing values. Parameters ---------- darray : ndarray Input data array to clusterize. increment : {float}, optional Increment between two consecutive values to group. By default, use a value of 1. operator : {function}, optional Comparison operator for the definition of clusters. By default, use :func:`numpy.less_equal`. Attributes ---------- inishape Shape of the argument array (stored for resizing). inisize Size of the argument array. uniques : sequence List of unique cluster values, as they appear in chronological order. slices : sequence List of the slices corresponding to each cluster of data. starts : ndarray Lists of the indices at which the clusters start. ends : ndarray Lists of the indices at which the clusters end. clustered : list List of clustered data. Examples -------- >>> A = [0, 0, 1, 2, 2, 2, 3, 4, 3, 4, 4, 4] >>> klust = cluster(A,0) >>> [list(_) for _ in klust.clustered] [[0, 0], [1], [2, 2, 2], [3], [4], [3], [4, 4, 4]] >>> klust.uniques array([0, 1, 2, 3, 4, 3, 4]) >>> x = [ 1.8, 1.3, 2.4, 1.2, 2.5, 3.9, 1. , 3.8, 4.2, 3.3, ... 1.2, 0.2, 0.9, 2.7, 2.4, 2.8, 2.7, 4.7, 4.2, 0.4] >>> Brackets(x,1).starts array([ 0, 2, 3, 4, 5, 6, 7, 10, 11, 13, 17, 19]) >>> Brackets(x,1.5).starts array([ 0, 6, 7, 10, 13, 17, 19]) >>> Brackets(x,2.5).starts array([ 0, 6, 7, 19]) >>> Brackets(x,2.5,greater).starts array([ 0, 1, 2, 3, 4, 5, 8, 9, 10, ... 11, 12, 13, 14, 15, 16, 17, 18]) >>> y = [ 0, -1, 0, 0, 0, 1, 1, -1, -1, -1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0] >>> Brackets(y,1).starts array([ 0, 1, 2, 5, 7, 10, 12, 16, 18]) """ cdef readonly double increment cdef readonly np.ndarray data cdef readonly list _starts cdef readonly list _ends def __init__(Brackets self, object data, double increment=1, object operator=np.less_equal): """ """ cdef int i, n, ifirst, ilast, test cdef double last cdef list starts, ends # self.increment = increment self.data = np.asanyarray(data) data = np.asarray(data) # n = len(data) starts = [] ends = [] # last = data[0] ifirst = 0 ilast = 0 for 1 <= i < n: test = operator(abs(data[i] - last), increment) ilast = i if not test: starts.append(ifirst) ends.append(ilast-1) ifirst = i last = data[i] starts.append(ifirst) ends.append(n-1) self._starts = starts self._ends = ends def __len__(self): return len(self.starts) property starts: # def __get__(Brackets self): return np.asarray(self._starts) property ends: # def __get__(Brackets self): return np.asarray(self._ends) property sizes: # def __get__(Brackets self): return np.asarray(self._ends) - np.asarray(self._firsts) property slices: # def __get__(Brackets self): cdef int i cdef list starts = self._starts, ends = self._ends cdef list slices = [] for 0 <= i < len(starts): slices.append(slice(starts[i], ends[i]+1)) return slices property clustered: # def __get__(self): cdef int i cdef list starts = self._starts, ends = self._ends cdef list groups = [] cdef np.ndarray data = self.data for 0 <= i < len(starts): groups.append(data[starts[i]:ends[i]+1]) return groups property uniques: def __get__(self): return self.data[self.starts] def grouped_slices(self): """ Returns a dictionary with the unique values of ``self`` as keys, and a list of slices for the corresponding values. See Also -------- Brackets.grouped_limits that does the same thing """ # Define shortcuts cdef int i, ifirst, n = len(self) cdef list starts = self._starts, ends = self._ends cdef np.ndarray data = self.data # Define new variables cdef list seen = [] cdef double value cdef dict grouped = {} for 0 <= i < n: ifirst = starts[i] value = data[ifirst] if not (value in seen): grouped[value] = [] seen.append(value) grouped[value].append(slice(ifirst, ends[i]+1)) return grouped def grouped_limits(self): """ Returns a dictionary with the unique values of ``self`` as keys, and a list of tuples (starting index, ending index) for the corresponding values. See Also -------- Cluster.grouped_slices """ # Define shortcuts cdef int i, ifirst, n = len(self) cdef list starts = self.starts, ends = self.ends cdef np.ndarray data = self.data # Define new variables cdef list seen = [] cdef double value cdef dict grouped = {} for 0 <= i < n: ifirst = starts[i] value = data[ifirst] if not (value in seen): grouped[value] = [] seen.append(value) grouped[value].append((ifirst, ends[i]+1)) for k in grouped: grouped[k] = np.asarray(grouped[k]) return grouped _______________________________________________ Numpy-discussion mailing list Numpy-discussion@scipy.org http://mail.scipy.org/mailman/listinfo/numpy-discussion