[CC'ing John Millikin, enumerator's maintainer]

On Mon, Apr 25, 2011 at 7:10 PM, Skirmantas Kligys
<skirmantas.kli...@gmail.com> wrote:
> I expected to be able to do what SAX does in Java, i.e. to avoid loading the
> whole 2 gigabytes into memory.  For warm-up, I wrote an iteratee to count 
> lines
> in the file, and it does load the whole file into memory!  After profiling, I
> see that the problem was Data.Enumerator.Text.utf8, it allocates up to 60
> megabytes when run on a 40 megabyte test file.

It seems to me that this is a bug in enumerator's "strict" fold not
being strict at all =).  The current version 0.4.9.1 of
Data.Enumerator.List.fold is

-- | Consume the entire input stream with a strict left fold, one element
-- at a time.
--
-- Since: 0.4.8
fold :: Monad m => (b -> a -> b) -> b
       -> Iteratee a m b
fold step = continue . loop where
        f = L.foldl' step
        loop acc stream = case stream of
                Chunks [] -> continue (loop acc)
                Chunks xs -> continue (loop (f acc xs))
                EOF -> yield acc EOF

Note that the list fold is strict (f = Data.List.foldl' step),
*however* the acc parameter of loop isn't strict at all!  It just
creates a big, fat thunk with references to all of you input =(.

But the fix is extremely easy, just change the 'Chunks xs' line to

                Chunks xs -> continue (loop $! f acc xs)

Using only your iterLinesWc test with a 105 MiB file (a movie I had
lying around), with enumerator's definition it takes 220 MiB of memory
and 1.3~1.5 seconds according to +RTS -s.  By doing only this very
change above, it takes 2 MiB of memory (100x improvement :P) and
0.8~0.9 seconds.

John Millikin, could you please apply the attached patch? =)

Cheers,

-- 
Felipe.
# Bazaar merge directive format 2 (Bazaar 0.90)
# revision_id: felipe.le...@gmail.com-20110426020019-mad7rofbt62zwt49
# target_branch: http://john-millikin.com/branches/enumerator/0.4/
# testament_sha1: 8888ff7491ebbe8698c6cad156203566aabd5aed
# timestamp: 2011-04-25 23:01:15 -0300
# base_revision_id: jmilli...@gmail.com-20110329181202-\
#   pzq011f39m787xld
# 
# Begin patch
=== modified file 'enumerator/src/list-analogues.anansi'
--- enumerator/src/list-analogues.anansi	2011-03-29 05:24:03 +0000
+++ enumerator/src/list-analogues.anansi	2011-04-26 02:00:19 +0000
@@ -19,7 +19,7 @@
 	f = L.foldl' step
 	loop acc stream = case stream of
 		Chunks [] -> continue (loop acc)
-		Chunks xs -> continue (loop (f acc xs))
+		Chunks xs -> continue (loop $! f acc xs)
 		EOF -> yield acc EOF
 :
 
@@ -29,7 +29,7 @@
       -> Iteratee a m b
 foldM step = continue . loop where
 	f = CM.foldM step
-	
+
 	loop acc stream = acc `seq` case stream of
 		Chunks [] -> continue (loop acc)
 		Chunks xs -> lift (f acc xs) >>= continue . loop
@@ -134,7 +134,7 @@
 concatMapM f = checkDone (continue . step) where
 	step k EOF = yield (Continue k) EOF
 	step k (Chunks xs) = loop k xs
-	
+
 	loop k [] = continue (step k)
 	loop k (x:xs) = do
 		fx <- lift (f x)
@@ -180,7 +180,7 @@
 concatMapM f = checkDone (continue . step) where
 	step k EOF = yield (Continue k) EOF
 	step k (Chunks xs) = loop k (BL.unpack (BL.fromChunks xs))
-	
+
 	loop k [] = continue (step k)
 	loop k (x:xs) = do
 		fx <- lift (f x)
@@ -206,7 +206,7 @@
 concatMapM f = checkDone (continue . step) where
 	step k EOF = yield (Continue k) EOF
 	step k (Chunks xs) = loop k (TL.unpack (TL.fromChunks xs))
-	
+
 	loop k [] = continue (step k)
 	loop k (x:xs) = do
 		fx <- lift (f x)
@@ -222,7 +222,7 @@
 mapAccum f s0 = checkDone (continue . step s0) where
 	step _ k EOF = yield (Continue k) EOF
 	step s k (Chunks xs) = loop s k xs
-	
+
 	loop s k [] = continue (step s k)
 	loop s k (x:xs) = case f s x of
 		(s', ai) -> k (Chunks [ai]) >>==
@@ -233,7 +233,7 @@
 mapAccumM f s0 = checkDone (continue . step s0) where
 	step _ k EOF = yield (Continue k) EOF
 	step s k (Chunks xs) = loop s k xs
-	
+
 	loop s k [] = continue (step s k)
 	loop s k (x:xs) = do
 		(s', ai) <- lift (f s x)
@@ -247,7 +247,7 @@
 mapAccum f s0 = checkDone (continue . step s0) where
 	step _ k EOF = yield (Continue k) EOF
 	step s k (Chunks xs) = loop s k xs
-	
+
 	loop s k [] = continue (step s k)
 	loop s k (x:xs) = case B.uncons x of
 		Nothing -> loop s k xs
@@ -260,7 +260,7 @@
 mapAccumM f s0 = checkDone (continue . step s0) where
 	step _ k EOF = yield (Continue k) EOF
 	step s k (Chunks xs) = loop s k xs
-	
+
 	loop s k [] = continue (step s k)
 	loop s k (x:xs) = case B.uncons x of
 		Nothing -> loop s k xs
@@ -276,7 +276,7 @@
 mapAccum f s0 = checkDone (continue . step s0) where
 	step _ k EOF = yield (Continue k) EOF
 	step s k (Chunks xs) = loop s k xs
-	
+
 	loop s k [] = continue (step s k)
 	loop s k (x:xs) = case T.uncons x of
 		Nothing -> loop s k xs
@@ -289,7 +289,7 @@
 mapAccumM f s0 = checkDone (continue . step s0) where
 	step _ k EOF = yield (Continue k) EOF
 	step s k (Chunks xs) = loop s k xs
-	
+
 	loop s k [] = continue (step s k)
 	loop s k (x:xs) = case T.uncons x of
 		Nothing -> loop s k xs
@@ -521,7 +521,7 @@
 	loop acc n' (Chunks xs) = iter where
 		lazy = BL.fromChunks xs
 		len = toInteger (BL.length lazy)
-		
+
 		iter = if len < n'
 			then continue (loop (acc . (BL.append lazy)) (n' - len))
 			else let
@@ -538,7 +538,7 @@
 	loop acc n' (Chunks xs) = iter where
 		lazy = TL.fromChunks xs
 		len = toInteger (TL.length lazy)
-		
+
 		iter = if len < n'
 			then continue (loop (acc . (TL.append lazy)) (n' - len))
 			else let
@@ -777,7 +777,7 @@
 isolate n step | n <= 0 = return step
 isolate n (Continue k) = continue loop where
 	len = L.genericLength
-	
+
 	loop (Chunks []) = continue loop
 	loop (Chunks xs)
 		| len xs <= n = k (Chunks xs) >>== isolate (n - len xs)
@@ -797,7 +797,7 @@
 	loop (Chunks xs) = iter where
 		lazy = BL.fromChunks xs
 		len = toInteger (BL.length lazy)
-		
+
 		iter = if len <= n
 			then k (Chunks xs) >>== isolate (n - len)
 			else let
@@ -816,7 +816,7 @@
 	loop (Chunks xs) = iter where
 		lazy = TL.fromChunks xs
 		len = toInteger (TL.length lazy)
-		
+
 		iter = if len <= n
 			then k (Chunks xs) >>== isolate (n - len)
 			else let

# Begin bundle
IyBCYXphYXIgcmV2aXNpb24gYnVuZGxlIHY0CiMKQlpoOTFBWSZTWaiJbbcAAgZfgAAwdGP/91sG
AAC////wUAP4o9O0bgcO7UJKCaBNNTbSp+TKPRqaPJIZGI9NRgymhHpNMJqmh6h6JoAAAAwkhAin
ppqn6aEaaT0gPUDTNQ0DGGhoAAANAAAAACSSYkYI1PCTyjJ6mmmhpoAGgzYAGfXJXsnZB45bimdj
S6VkkUnpRaWuhqU27oa10pPTyrz/OnPFgZhLp4CD9Stl1RqxfOM/VIdoZUq71exR1Du9HmzNaz1a
bAzT+eX238Ix8v9942+bQxSbRt9/Gyu+NkY6prEZqnIm6lTToIhbkygX8rRblw3Q1rzSLtyWGF8V
0mr2vbNkQwDtDEmg1C2jyhC/aegoXeLkVmLrM+2pJLVcVhb5lpBIKxSNQgpc5Vm5vxp9GFDCuupy
3mHbNCeHVjYrENhtngx5Ggit00DMzNwtV8htgOqKhXRtwBqe0ha9UXuHVzcd5XpxhOhTTmCno44x
iMjBYvcMEhMWIdb+qwQZUCWV7G7dw4SFhZ8Rsok5gTvICupxcLwkm5uEhBTMLBg4g9IF6hEQ2xDX
RgFRQZTPk1bBU0NSalJfVAYZjEJyidhxY1chLVWZwH41xFQXgZcLhSZfYb3KNEq+gyFgnMXBIxNW
A6hgX4DbsERvuKBhbHayyuOzVOUhONJ7Wi7X+iYnlclIBjQ8MJZUB2aDSinTjaaieEchBRA4XYmT
FzR1+LEeCkE3TQ0ioUq2v5HHK7s1wx1+Hu40DglW9kx08I7sWSu2sDPVw3fiR+2FZJMD95CZKU+b
q+Voq5dQ3D0jprB4R3sNVM9W0ooT9vLQu5l2brinHEWSQT7IYUE3/3XOZFI2YOL9K6K8edhNUpFU
Jsg9d2lKhiboerXSZ4OSdPz/zfLyXHO+8720ThVw40ZeO1kk2/vpYyr1afDkpT9Ji5Uxsjnfgu3r
YlAlpBimd4WE7Ll2eDjgJ7zT8nmMt6aRG67e5ih2+LdEfCR2LkXBfDrsED6GEhLOYFTDjyTRyNYS
pHOtYQLLANHJ91j6wnA2gymkd7xC84EpoyxC0L/2RIwCkKXBh/KCv4JRCIGWTVXZg3SBhmKBG5Ns
FxSMlLmwmLA6M8YHoRefTOBt5FXMDwe1M71aQZQZMzLbUlWqJNgKAcHQuVnORee9C4WWlmMLREAP
vo+pUwrUqB2prk55EOyRTlcVTeQoxnMOXxaFrLakEYc2m1/tWwHsqd9kEGYlDQNXfhTQ7eK4UX6S
KarTOYtLZjUTTBBy0wPNllnpHJbB/G3EhXh0MOBz7hOsyiy2Mc4tdmRdrURf4u5IpwoSFREttuA=
_______________________________________________
Haskell-Cafe mailing list
Haskell-Cafe@haskell.org
http://www.haskell.org/mailman/listinfo/haskell-cafe

Reply via email to