From 598ce1e155d4fb02af0991374730d5f95624a65d Mon Sep 17 00:00:00 2001
From: Piotr Piastucki <ppiastucki@taleo.com>
Date: Mon, 30 Apr 2012 11:34:13 +0200
Subject: [PATCH] Inline highlighting performance improvements Although
 MyersSequenceMatcher includes a couple of performance
 optimizations in the pre-processing phase none of them is
 efficient enough when dealing with character-based
 matching. This patch adds a new matcher class called
 InlineMyersSequenceMatcher which provides
 additional optimization based on the assumption that a
 single character-based chunk must be at least 3 characters
 long.

---
 meld/filediff.py |    2 +-
 meld/matchers.py |  114 ++++++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 95 insertions(+), 21 deletions(-)

diff --git a/meld/filediff.py b/meld/filediff.py
index de879a3..d50e427 100644
--- a/meld/filediff.py
+++ b/meld/filediff.py
@@ -58,7 +58,7 @@ class CachedSequenceMatcher(object):
             self.cache[(text1, textn)][1] = time.time()
             return self.cache[(text1, textn)][0]
         except KeyError:
-            matcher = matchers.MyersSequenceMatcher(None, text1, textn)
+            matcher = matchers.InlineMyersSequenceMatcher(None, text1, textn)
             opcodes = matcher.get_opcodes()
             self.cache[(text1, textn)] = [opcodes, time.time()]
             return opcodes
diff --git a/meld/matchers.py b/meld/matchers.py
index f92d743..d0371c3 100644
--- a/meld/matchers.py
+++ b/meld/matchers.py
@@ -74,36 +74,27 @@ class MyersSequenceMatcher(difflib.SequenceMatcher):
     def get_difference_opcodes(self):
         return filter(lambda x: x[0] != "equal", self.get_opcodes())
 
-    def preprocess(self):
-        """
-        Pre-processing optimizations:
-        1) remove common prefix and common suffix
-        2) remove lines that do not match
-        """
-        a = self.a
-        b = self.b
-        aindex = self.aindex = {}
-        bindex = self.bindex = {}
-        n = len(a)
-        m = len(b)
+    def preprocess_remove_prefix_suffix(self, a, b):
         # remove common prefix and common suffix
         self.common_prefix = self.common_suffix = 0
         self.common_prefix = find_common_prefix(a, b)
         if self.common_prefix > 0:
             a = a[self.common_prefix:]
             b = b[self.common_prefix:]
-            n -= self.common_prefix
-            m -= self.common_prefix
 
-        if n > 0 and m > 0:
+        if len(a) > 0 and len(b) > 0:
             self.common_suffix = find_common_suffix(a, b)
             if self.common_suffix > 0:
-                a = a[:n - self.common_suffix]
-                b = b[:m - self.common_suffix]
-                n -= self.common_suffix
-                m -= self.common_suffix
-
+                a = a[:len(a) - self.common_suffix]
+                b = b[:len(b) - self.common_suffix]
+        return (a, b)
+
+    def preprocess_discard_nonmatching_lines(self, a, b):
         # discard lines that do not match any line from the other file
+        aindex = self.aindex = {}
+        bindex = self.bindex = {}
+        n = len(a)
+        m = len(b)
         if n > 0 and m > 0:
             aset = frozenset(a)
             bset = frozenset(b)
@@ -129,6 +120,15 @@ class MyersSequenceMatcher(difflib.SequenceMatcher):
                 b = b2
         return (a, b)
 
+    def preprocess(self):
+        """
+        Pre-processing optimizations:
+        1) remove common prefix and common suffix
+        2) remove lines that do not match
+        """
+        a, b = self.preprocess_remove_prefix_suffix(self.a, self.b)
+        return self.preprocess_discard_nonmatching_lines(a, b)
+
     def postprocess(self):
         mb = [self.matching_blocks[-1]]
         i = len(self.matching_blocks) - 2
@@ -288,3 +288,77 @@ class MyersSequenceMatcher(difflib.SequenceMatcher):
         self.build_matching_blocks(lastsnake, snakes)
         self.postprocess()
         yield 1
+
+class InlineMyersSequenceMatcher(MyersSequenceMatcher):
+
+    def preprocess_discard_nonmatching_lines(self, a, b):
+        aindex = self.aindex = {}
+        bindex = self.bindex = {}
+        n = len(a)
+        m = len(b)
+        if m > 2 and n > 2:
+            a2 = []
+            b2 = []
+            aset = set()
+            bset = set()
+            for i in range(n - 2):
+                aset.add((a[i], a[i+1], a[i+2]))
+            for i in range(m - 2):
+                bset.add((b[i], b[i+1], b[i+2]))
+            j = 0
+            c1 = None
+            c2 = None
+            matched1 = False
+            matched2 = False
+            for i, c3 in enumerate(b):
+                if (c1, c2, c3) in aset:
+                    if not matched1:
+                        b2.append(c1)
+                        bindex[j] = i - 2
+                        j += 1
+                    if not matched2:
+                        b2.append(c2)
+                        bindex[j] = i - 1
+                        j += 1
+                    b2.append(c3)
+                    bindex[j] = i
+                    j += 1
+                    matched1 = matched2 = True
+                else:
+                    matched1 = matched2
+                    matched2 = False
+                c1 = c2
+                c2 = c3
+
+            k = 0
+            c1 = None
+            c2 = None
+            matched1 = False
+            matched2 = False
+            for i, c3 in enumerate(a):
+                if (c1, c2, c3) in bset:
+                    if not matched1:
+                        a2.append(c1)
+                        aindex[k] = i - 2
+                        k += 1
+                    if not matched2:
+                        a2.append(c2)
+                        aindex[k] = i - 1
+                        k += 1
+                    a2.append(c3)
+                    aindex[k] = i
+                    k += 1
+                    matched1 = matched2 = True
+                else:
+                    matched1 = matched2
+                    matched2 = False
+                c1 = c2
+                c2 = c3
+            # We only use the optimised result if it's worthwhile. The constant
+            # represents a heuristic of how many lines constitute 'worthwhile'.
+            self.lines_discarded = m - j > 10 or n - k > 10
+            if self.lines_discarded:
+                a = a2
+                b = b2
+        return (a, b)
+
-- 
1.7.9.5

