Perhaps the optimizer unrolls the inner loop, and thus can skip safety checks. Naively, seems trickier to do for the iterator.
> El ene. 4, 2017, a las 9:10 PM, Jens Persson via swift-users > <swift-users@swift.org> escribió: > > I noticed disabling safety checks made the custom Iterator as fast as the two > nested for-loops. > Karl, how does your test change when disabling safety checks? > > Anyone having an idea why disabling safety checks should make the two sum > funcs equally fast in my example program? > > (It shouldn't be the preconditions of Table2D<>'s subscripts, unless the > optimizer (with safety checks) can remove them in the case of the nested > for-loops but not in the case of the custom iterator.) > > /Jens > > >> On Wed, Jan 4, 2017 at 6:59 PM, Jens Persson <j...@bitcycle.com> wrote: >> Thanks, I wonder if it is currently impossible to make it as fast as the >> nested for loops, ie that some optimizer improvement could fix it. >> Here's a stripped down version of my code: >> >> >> import QuartzCore // This is just for timing using CACurrentMediaTime() >> >> >> struct Point2DInt { >> var x: Int >> var y: Int >> } >> >> struct Size2DInt { >> var width: Int >> var height: Int >> var rect: Rect2DInt { return Rect2DInt(x: 0, y: 0, width: width, height: >> height) } >> } >> >> struct Rect2DInt : Sequence { >> var description: String { return "(\(origin), \(size))" } >> var origin: Point2DInt >> var size: Size2DInt >> var minX: Int { >> get { return origin.x } >> set { size.width = maxX - newValue; origin.x = newValue } >> } >> var maxX: Int { >> get { return origin.x + size.width } >> set { size.width = newValue - minX } >> } >> var minY: Int { >> get { return origin.y } >> set { size.height = maxY - newValue; origin.y = newValue } >> } >> var maxY: Int { >> get { return origin.y + size.height } >> set { size.height = newValue - minY } >> } >> init(origin: Point2DInt, size: Size2DInt) { >> self.origin = origin >> self.size = size >> } >> init(x: Int, y: Int, width: Int, height: Int) { >> self.init(origin: Point2DInt(x: x, y: y), size: Size2DInt(width: >> width, height: height)) >> } >> init(minX: Int, minY: Int, maxX: Int, maxY: Int) { >> self.init(origin: Point2DInt(x: minX, y: minY), >> size: Size2DInt(width: maxX - minX, height: maxY - minY)) >> } >> func makeIterator() -> Rect2DIntPointIterator { >> return Rect2DIntPointIterator(rect: self) >> } >> } >> >> >> // This is the crucial type here, this version is the fastest that >> // I could find, but it's still slower than two nested for loops, >> // see test below. >> struct Rect2DIntPointIterator : IteratorProtocol, Sequence { >> let startX, startY, stopX, stopY: Int >> var currentPoint: Point2DInt >> init(rect: Rect2DInt) { >> currentPoint = rect.origin >> startX = rect.origin.x >> startY = rect.origin.y >> stopX = rect.maxX >> stopY = rect.maxY >> } >> mutating func next() -> Point2DInt? { >> defer { currentPoint.x = currentPoint.x &+ 1 } >> if currentPoint.x == stopX { >> currentPoint.x = startX >> currentPoint.y = currentPoint.y &+ 1 >> if currentPoint.y == stopY { return nil } >> } >> return currentPoint >> } >> } >> >> >> struct Table2D<Element> { >> let size: Size2DInt >> var storage: [Element] >> init(size: Size2DInt, filledWith element: Element) { >> precondition(size.width > 0 && size.height > 0) >> self.size = size >> self.storage = [Element](repeating: element, count: size.width * >> size.height) >> } >> subscript(x: Int, y: Int) -> Element { >> get { >> precondition(x >= 0 && y >= 0 && x < size.width && y < >> size.height) >> return storage[x + y * size.width] >> } >> set { >> precondition(x >= 0 && y >= 0 && x < size.width && y < >> size.height) >> storage[x + y * size.width] = newValue >> } >> } >> subscript(position: Point2DInt) -> Element { >> get { return self[position.x, position.y] } >> set { self[position.x, position.y] = newValue } >> } >> } >> >> func randomDouble() -> Double { >> // Returns a random Double in the range [0, 1) >> let ui64 = (UInt64(arc4random()) << 32) | UInt64(arc4random()) >> return Double(ui64 >> UInt64(63 - Double.significandBitCount)) * >> .ulpOfOne/2 >> } >> func randomInt(from: Int, to: Int) -> Int { >> // Returns an Int in the range [from, to) >> return Int(Double(from) + (randomDouble() * Double(to - >> from)).rounded(.down)) >> } >> >> func randomSubrects(of rect: Rect2DInt, minSize: Size2DInt, count: Int) -> >> [Rect2DInt] { >> precondition(count > 0 && minSize.width > 0 && minSize.height > 0) >> var subrects = [Rect2DInt]() >> subrects.reserveCapacity(count) >> for _ in 0 ..< count { >> let size = Size2DInt(width: randomInt(from: minSize.width, to: >> rect.size.width), >> height: randomInt(from: minSize.height, to: >> rect.size.height)) >> let origin = Point2DInt(x: randomInt(from: 0, to: rect.size.width - >> size.width), >> y: randomInt(from: 0, to: rect.size.height - >> size.height)) >> subrects.append(Rect2DInt(origin: origin, size: size)) >> } >> return subrects >> } >> >> >> func randomTable(size: Size2DInt) -> Table2D<Double> { >> var table = Table2D(size: size, filledWith: 0.0) >> for p in table.size.rect { table[p] = randomDouble() } >> return table >> } >> >> >> func sum1(areas: [Rect2DInt], of table: Table2D<Double>) -> Double { >> var sum = 0.0 >> for r in areas { >> // Using custom iterator: >> for p in r { sum += table[p] } >> } >> return sum >> } >> >> func sum2(areas: [Rect2DInt], of table: Table2D<Double>) -> Double { >> var sum = 0.0 >> for r in areas { >> // Using two nested for loops: >> for y in r.minY ..< r.maxY { >> for x in r.minX ..< r.maxX { >> sum = sum + table[x, y] >> } >> } >> } >> return sum >> } >> >> func test( >> sumFn: ([Rect2DInt], Table2D<Double>) -> Double, >> label: String, >> table: Table2D<Double>, >> areas: [Rect2DInt] >> ) >> { >> let t0 = CACurrentMediaTime() >> let sum = sumFn(areas, table) >> let t1 = CACurrentMediaTime() >> print(label, t1 - t0, "seconds (sum \(sum))") >> } >> >> for _ in 0 ..< 4 { >> let rndTable = randomTable(size: Size2DInt(width: 1000, height: 1000)) >> let rndTableAreas = randomSubrects(of: rndTable.size.rect, >> minSize: Size2DInt(width: 100, >> height: 100), >> count: 1000) >> test(sumFn: sum1, label: "sum1 - using custom iterator ", table: >> rndTable, areas: rndTableAreas) >> test(sumFn: sum2, label: "sum2 - using nested for-loops ", table: >> rndTable, areas: rndTableAreas) >> print() >> } >> >> // >> // Typical output on my MacBook Pro (Retina, 15-inch, Late 2013): >> // >> // sum1 - using custom iterator 0.480134483019356 seconds (sum >> 153408603.850653) >> // sum2 - using nested for-loops 0.348341046017595 seconds (sum >> 153408603.850653) >> // >> // sum1 - using custom iterator 0.426998238021042 seconds (sum >> 149851816.622638) >> // sum2 - using nested for-loops 0.34111139801098 seconds (sum >> 149851816.622638) >> // >> // sum1 - using custom iterator 0.466021075990284 seconds (sum >> 155267702.297466) >> // sum2 - using nested for-loops 0.351970263000112 seconds (sum >> 155267702.297466) >> // >> // sum1 - using custom iterator 0.426723245007452 seconds (sum >> 146331850.202214) >> // sum2 - using nested for-loops 0.340267747989856 seconds (sum >> 146331850.202214) >> // >> >> >>> On Wed, Jan 4, 2017 at 12:42 PM, Karl <razie...@gmail.com> wrote: >>> Hmmm that’s interesting. A brief test I ran: >>> >>> import CoreGraphics >>> import Foundation >>> >>> struct PointIterator { >>> let rect: CGRect >>> var nextPoint: CGPoint >>> >>> let maxX: CGFloat >>> let maxY: CGFloat >>> >>> init(rect: CGRect) { >>> self.rect = rect.standardized >>> self.nextPoint = self.rect.origin >>> // Cache for fast iteration >>> self.maxX = self.rect.maxX >>> self.maxY = self.rect.maxY >>> } >>> >>> mutating func next() -> CGPoint? { >>> guard nextPoint.x <= maxX, nextPoint.y <= maxY else { >>> return .none >>> } >>> defer { >>> nextPoint.x += 1 >>> if nextPoint.x > maxX { >>> nextPoint.x = rect.origin.x >>> nextPoint.y += 1 >>> } >>> } >>> return nextPoint >>> } >>> } >>> >>> // Use iterator >>> func iteratePoints_it(_ rect: CGRect, with: (CGPoint)->()) { >>> var it = PointIterator(rect: rect) >>> while let point = it.next() { >>> with(point) >>> } >>> } >>> >>> // Basic unwrapping of the iterator as a function (no ‘defer’) >>> func iteratePoints_fe(_ rect: CGRect, with: (CGPoint)->()) { >>> let rect = rect.standardized >>> var nextPoint = rect.origin >>> let maxX = rect.maxX >>> let maxY = rect.maxY >>> >>> while true { >>> guard nextPoint.x <= maxX, nextPoint.y <= maxY else { >>> return >>> } >>> with(nextPoint) >>> nextPoint.x += 1 >>> if nextPoint.x > maxX { >>> nextPoint.x = rect.origin.x >>> nextPoint.y += 1 >>> } >>> } >>> } >>> >>> // for..in loop >>> func iteratePoints_fe2(_ rect: CGRect, with: (CGPoint)->()) { >>> let rect = rect.standardized >>> let maxX = rect.maxX >>> let maxY = rect.maxY >>> for y in stride(from: rect.origin.y, to: maxY, by: 1) { >>> for x in stride(from: rect.origin.x, to: maxX, by: 1) { >>> with(CGPoint(x: x, y: y)) >>> } >>> } >>> } >>> >>> func profile(_ iterations: Int, _ thing: ()->()) -> TimeInterval { >>> var totalTime: TimeInterval = 0 >>> for _ in 0..<iterations { >>> let start = Date().timeIntervalSinceReferenceDate >>> thing() >>> totalTime += (Date().timeIntervalSinceReferenceDate - start) >>> } >>> return totalTime/TimeInterval(iterations) >>> } >>> >>> >>> let bigRect = CGRect(x: 0, y: 0, width: 10_000, height: 10_000) >>> >>> let iterator = profile(10) { iteratePoints_it(bigRect) { if $0.x > >>> 1_000_000 { print("?") } } } // always false, won't be optimised out. >>> let foreach = profile(10) { iteratePoints_fe(bigRect) { if $0.x > >>> 1_000_000 { print("?") } } } >>> let foreach2 = profile(10) { iteratePoints_fe2(bigRect) { if $0.x > >>> 1_000_000 { print("?") } } } >>> print("iterator: \(iterator) \nforeach: \(foreach) \nforeach2: \(foreach2)") >>> >>> Results: >>> >>> iterator: 0.316907703876495 >>> foreach: 0.283202117681503 >>> foreach2: 0.568318998813629 >>> >>> That ranking is consistent, too. Using an iterator does appear marginally >>> slower than a basic unwrapping of the iterator in to a function. >>> >>>> On 4 Jan 2017, at 09:56, Jens Persson via swift-users >>>> <swift-users@swift.org> wrote: >>>> >>>> Hi, >>>> >>>> I'm working on some low-level pixel processing code (stuff that is not >>>> possible to do using standard API or on eg GPU), and I had lots of eg >>>> these: >>>> >>>> for y in someStartY ..< someStopY { >>>> for x in someStartX ..< someStopX { >>>> ... pixels[x, y] ... >>>> } >>>> } >>>> >>>> So I implemented some (value) types like eg IntPoint2D, IntSize2D, >>>> IntRect2D and I made an IntRect2DIterator so that IntRect2D could be a >>>> Sequence over its (discrete) points. With this I could rewrite the above >>>> like so: >>>> >>>> for pixelPosAsIntPoint2D in someIntRect2D { >>>> ... pixels[pixelPosAsIntPoint2D] ... >>>> } >>>> >>>> For some reason the first version (two nested for loops for x and y) is >>>> always a bit faster than the abstracted version no matter how I write it >>>> (tried using eg &+ instead of + etc). >>>> >>>> >>>> >>>> Is it possible to write as a zero cost abstraction like this, if so, how? >>>> If not, why? >>>> >>>> >>>> >>>> >>>> PS >>>> >>>> Note that eg this: >>>> >>>> for y in someStartY ..< someStopY { >>>> for x in someStartX ..< someStopX { >>>> let pixelPosAsIntPoint2D = IntPoint2D(x: x, y: y) >>>> ... pixels[pixelPosAsIntPoint2D] ... >>>> } >>>> } >>>> >>>> is exactly as fast as the top example (using just ... pixels[x, y] ...). >>>> So the difference in execution time seems to be due to something in the >>>> Iterator and not eg the pixel accessing subscript taking the 2d int point >>>> type instead of separate x and y ints. >>>> >>>> Here is one Iterator variant that I have tested: >>>> >>>> struct Rect2DIntPointIterator : IteratorProtocol, Sequence { >>>> let startX, startY, stopX, stopY: Int >>>> var px, py: Int >>>> init(rect: Rect2DInt) { >>>> startX = rect.origin.x >>>> startY = rect.origin.y >>>> stopX = rect.maxX >>>> stopY = rect.maxY >>>> px = startX >>>> py = startY >>>> } >>>> mutating func next() -> Point2DInt? { >>>> defer { px = px &+ 1 } >>>> if px == stopX { >>>> px = startX >>>> py = py &+ 1 >>>> if py == stopY { return nil } >>>> } >>>> return Point2DInt(x: px, y: py) >>>> } >>>> } >>>> >>>> >>>> And here are typical execution times from an example test: >>>> 2.1 seconds using my Iterator (the fastest I can get it, no matter how I >>>> try to rewrite it). >>>> 1.5 seconds using nested x, y for loops. >>>> >>>> I'm pretty sure my testing is done thoroughly (measuring average of many >>>> runs, using random data, avoiding dead code elimination, whole module >>>> optimization, etc). >>>> >>>> I have tried profiling the code and looking at the disassmbly but I'm >>>> failing to understand what's going on. >>>> >>>> So the ultimate answer would be in the form of a (2d, Int) Rectangle type >>>> whose (2d, Int) Points can be iterated in a for loop, at zero cost >>>> compared to doing the same using two nested for loops. Or an explanation >>>> of why this is impossible. >>>> >>>> DS >>>> >>>> /Jens >>>> >>>> _______________________________________________ >>>> swift-users mailing list >>>> swift-users@swift.org >>>> https://lists.swift.org/mailman/listinfo/swift-users >>> >> > > _______________________________________________ > swift-users mailing list > swift-users@swift.org > https://lists.swift.org/mailman/listinfo/swift-users
_______________________________________________ swift-users mailing list swift-users@swift.org https://lists.swift.org/mailman/listinfo/swift-users