Bug#848049: diffoscope: Add detection of order-only differences in plain text formats

2017-01-12 Thread Maria Glukhova
On Sun, 25 Dec 2016 15:28:52 +0100 Jérémy Bobbio  wrote:

Hi Lunar!

> You would not have to read the file twice as long as you do the hash
> in the difference module, when each line is actually fed to diff.
> A similar trick is already used to cope with files that are too long,
> see diffoscope.difference.make_feeder_from_raw_reader()
>

I implemented what I believe was your idea in the attached patch. Thank you
for pointing me to it!
Still, I don't think that feature worth invading into diff.py/diffoscope.py
modules. It doesn't speed up comparison significantly, because call to diff
still takes most of the time on big files with difference only in line
order. Besides, I can't think of many examples of where that feature would
be needed, save from text files.

In any case, thank you again for taking time to provide me with that idea!


Maria
From 7abab29f878f7fe8e1229995c06a404d5d4e0b40 Mon Sep 17 00:00:00 2001
From: Maria Glukhova 
Date: Sat, 7 Jan 2017 18:42:19 +0300
Subject: [PATCH] Generic order-line difference for all kind of inputs.

---
 diffoscope/comparators/text.py | 13 -
 diffoscope/diff.py | 27 ---
 diffoscope/difference.py   | 24 
 3 files changed, 36 insertions(+), 28 deletions(-)

diff --git a/diffoscope/comparators/text.py b/diffoscope/comparators/text.py
index fc2f446..ea488c3 100644
--- a/diffoscope/comparators/text.py
+++ b/diffoscope/comparators/text.py
@@ -25,16 +25,6 @@ from diffoscope.difference import Difference
 from .utils.file import File
 
 
-def order_only_difference(unified_diff):
-diff_lines = unified_diff.splitlines()
-added_lines = [line[1:] for line in diff_lines if line.startswith('+')]
-removed_lines = [line[1:] for line in diff_lines if line.startswith('-')]
-# Faster check: does number of lines match?
-if len(added_lines) != len(removed_lines):
-return False
-return sorted(added_lines) == sorted(removed_lines) and added_lines != removed_lines
-
-
 class TextFile(File):
 RE_FILE_TYPE = re.compile(r'\btext\b')
 
@@ -55,9 +45,6 @@ class TextFile(File):
 with codecs.open(self.path, 'r', encoding=my_encoding) as my_content, \
  codecs.open(other.path, 'r', encoding=other_encoding) as other_content:
 difference = Difference.from_text_readers(my_content, other_content, self.name, other.name, source)
-# Check if difference is only in line order.
-if difference and order_only_difference(difference.unified_diff):
-difference.add_comment("ordering differences only")
 if my_encoding != other_encoding:
 if difference is None:
 difference = Difference(None, self.path, other.path, source)
diff --git a/diffoscope/diff.py b/diffoscope/diff.py
index 011916a..9783b27 100644
--- a/diffoscope/diff.py
+++ b/diffoscope/diff.py
@@ -190,12 +190,13 @@ def run_diff(fifo1, fifo2, end_nl_q1, end_nl_q2):
 
 return parser.diff
 
-def feed(feeder, f, end_nl_q):
+def feed(feeder, f, end_nl_q, order_hash_q):
 # work-around unified diff limitation: if there's no newlines in both
 # don't make it a difference
 try:
-end_nl = feeder(f)
+end_nl, h_order = feeder(f)
 end_nl_q.put(end_nl)
+order_hash_q.put(h_order)
 finally:
 f.close()
 
@@ -227,9 +228,9 @@ class ExThread(threading.Thread):
 raise ex
 
 @contextlib.contextmanager
-def fd_from_feeder(feeder, end_nl_q, fifo):
+def fd_from_feeder(feeder, end_nl_q, order_hash_q, fifo):
 f = open(fifo, 'wb')
-t = ExThread(target=feed, args=(feeder, f, end_nl_q))
+t = ExThread(target=feed, args=(feeder, f, end_nl_q, order_hash_q))
 
 t.daemon = True
 t.start()
@@ -272,16 +273,28 @@ def make_feeder_from_raw_reader(in_file, filter=lambda buf: buf):
 return end_nl
 return feeder
 
+def compare_order_hashes(order_hash_q1, order_hash_q2):
+# Check if the only difference is in line numbers by comparing sums of hashes.
+order_hash1 = order_hash_q1.get()
+order_hash2 = order_hash_q2.get()
+if order_hash1 is None or order_hash2 is None:
+return False
+return order_hash1 == order_hash2
+
 def diff(feeder1, feeder2):
 end_nl_q1 = Queue()
 end_nl_q2 = Queue()
+order_hash_q1 = Queue()
+order_hash_q2 = Queue()
 
 with tempfile.TemporaryDirectory() as tmpdir:
 fifo1 = '{}/f1'.format(tmpdir)
 fifo2 = '{}/f2'.format(tmpdir)
-fd_from_feeder(feeder1, end_nl_q1, fifo1)
-fd_from_feeder(feeder2, end_nl_q2, fifo2)
-return run_diff(fifo1, fifo2, end_nl_q1, end_nl_q2)
+fd_from_feeder(feeder1, end_nl_q1, order_hash_q1, fifo1)
+fd_from_feeder(feeder2, end_nl_q2, order_hash_q2, fifo2)
+diff_result = run_diff(fifo1, fifo2, end_nl_q1, end_nl_q2)
+order_only_diff = 

Bug#848049: diffoscope: Add detection of order-only differences in plain text formats

2016-12-25 Thread Chris Lamb
Jérémy Bobbio wrote:

[…]
> h += hash(line)
[…]

Watch out, using hash() often leads to unreproducible output. :)


Regards,

-- 
  ,''`.
 : :'  : Chris Lamb
 `. `'`  la...@debian.org / chris-lamb.co.uk
   `-



Bug#848049: diffoscope: Add detection of order-only differences in plain text formats

2016-12-25 Thread Jérémy Bobbio
Hi!

Маша Глухова:
> The reason why I did not use some algorihm like that is that it requires to
> read files for the second time. Right now, all the actual work with the
> content of the files (except for the quick check for has_same_content) is
> delegated to diff, and on big files, it occupies most of the time. Assuming
> that for big files, reading them from drive would be the bottleneck, I
> tried to avoid reading them again, instead working with the result of the
> diff.
> Still, I would be happily mistaken. I will implement your version and
> compare the performance.

You would not have to read the file twice as long as you do the hash
in the difference module, when each line is actually fed to diff.
A similar trick is already used to cope with files that are too long,
see diffoscope.difference.make_feeder_from_raw_reader()

I don't know if my suggestions is a good one. It might not be a good
idea at all. Feel free to discuss it with your mentor before spending
too much time on it.

> Thank you again :)

PS: Please call me Lunar. :)

-- 
Lunar.''`. 
lu...@debian.org: :Ⓐ  :  # apt-get install anarchism
`. `'` 
  `-   


signature.asc
Description: Digital signature


Bug#848049: diffoscope: Add detection of order-only differences in plain text formats

2016-12-25 Thread Маша Глухова
Jeremy,
Thank you for sharing that!
The reason why I did not use some algorihm like that is that it requires to
read files for the second time. Right now, all the actual work with the
content of the files (except for the quick check for has_same_content) is
delegated to diff, and on big files, it occupies most of the time. Assuming
that for big files, reading them from drive would be the bottleneck, I
tried to avoid reading them again, instead working with the result of the
diff.
Still, I would be happily mistaken. I will implement your version and
compare the performance.
Thank you again :)

Maria Glukhova

вс, 25 дек. 2016, 13:37 Jérémy Bobbio :

> Маша Глухова:
> > I believe the attached patch would provide the requested functionality.
>
> Nice work! :)
>
> > From: Maria Glukhova 
> > Date: Sat, 24 Dec 2016 12:29:57 +0200
> > Subject: [PATCH] Add detection of order-only difference in plain text
> format.
> >
> > Detect if the text files' contents differ only in line ordering, and
> give an appropriate comment.
> > […]
> > +def order_only_difference(unified_diff):
> > +diff_lines = unified_diff.splitlines()
> > +added_lines = [line[1:] for line in diff_lines if
> line.startswith('+')]
> > +removed_lines = [line[1:] for line in diff_lines if
> line.startswith('-')]
> > +# Faster check: does number of lines match?
> > +if len(added_lines) != len(removed_lines):
> > +return False
> > +# Counter stores line and number of its occurrences.
> > +return sorted(added_lines) == sorted(removed_lines)
>
> I guess it's a fine approach to the problem, but I wonder if it would
> not be better to use a slightly less accurate strategy that would be
> nicer to memory and CPU.
>
> What I have in mind is to incrementally compute a hash value that would
> give the same result even if the lines are in different order.
>
> Drawing from discussions on StackOverflow [1], I think doing a sum of
> Python's hash() would work. My test was:
>
> def unordered_hash(lines):
> h = 0
> for line in lines:
> h += hash(line)
> return h
>
> h1 = unordered_hash(open('tests/data/text_order1').readlines())
> h2 = unordered_hash(open('tests/data/text_order2').readlines())
> print(h1, h2, h1 == h2)
>
> That way, it could probably be implemented directly in the difference
> module and work for other file types than just text files.
>
>  [1]:
> https://stackoverflow.com/questions/30734848/order-independant-hash-algorithm
>
> --
> Lunar.''`.
> lu...@debian.org: :Ⓐ  :  # apt-get install anarchism
> `. `'`
>   `-
>


Bug#848049: diffoscope: Add detection of order-only differences in plain text formats

2016-12-25 Thread Jérémy Bobbio
Маша Глухова:
> I believe the attached patch would provide the requested functionality.

Nice work! :)

> From: Maria Glukhova 
> Date: Sat, 24 Dec 2016 12:29:57 +0200
> Subject: [PATCH] Add detection of order-only difference in plain text format.
> 
> Detect if the text files' contents differ only in line ordering, and give an 
> appropriate comment.
> […]
> +def order_only_difference(unified_diff):
> +diff_lines = unified_diff.splitlines()
> +added_lines = [line[1:] for line in diff_lines if line.startswith('+')]
> +removed_lines = [line[1:] for line in diff_lines if line.startswith('-')]
> +# Faster check: does number of lines match?
> +if len(added_lines) != len(removed_lines):
> +return False
> +# Counter stores line and number of its occurrences.
> +return sorted(added_lines) == sorted(removed_lines)

I guess it's a fine approach to the problem, but I wonder if it would
not be better to use a slightly less accurate strategy that would be
nicer to memory and CPU.

What I have in mind is to incrementally compute a hash value that would
give the same result even if the lines are in different order.

Drawing from discussions on StackOverflow [1], I think doing a sum of
Python's hash() would work. My test was:

def unordered_hash(lines):
h = 0
for line in lines:
h += hash(line)
return h

h1 = unordered_hash(open('tests/data/text_order1').readlines())
h2 = unordered_hash(open('tests/data/text_order2').readlines())
print(h1, h2, h1 == h2)

That way, it could probably be implemented directly in the difference
module and work for other file types than just text files.

 [1]: 
https://stackoverflow.com/questions/30734848/order-independant-hash-algorithm

-- 
Lunar.''`. 
lu...@debian.org: :Ⓐ  :  # apt-get install anarchism
`. `'` 
  `-   


signature.asc
Description: Digital signature


Bug#848049: diffoscope: Add detection of order-only differences in plain text formats

2016-12-24 Thread Daniel Shahaf
Маша Глухова wrote on Sat, Dec 24, 2016 at 18:14:16 +:
> +def order_only_difference(unified_diff):
> +diff_lines = unified_diff.splitlines()
> +added_lines = [line[1:] for line in diff_lines if line.startswith('+')]
> +removed_lines = [line[1:] for line in diff_lines if line.startswith('-')]
> +# Faster check: does number of lines match?
> +if len(added_lines) != len(removed_lines):
> +return False
> +# Counter stores line and number of its occurrences.
> +return sorted(added_lines) == sorted(removed_lines)

What happens if one of the files has a trailing newline and one does
not?  Strictly speaking, that's not an "ordering only difference", but
this function doesn't seem to handle this case.

Example:

% diff -u <(echo foo) <(printf foo) 
--- /proc/self/fd/112016-12-24 20:24:22.064115616 +
+++ /proc/self/fd/122016-12-24 20:24:22.064115616 +
@@ -1 +1 @@
-foo
+foo
\ No newline at end of file

Cheers,

Daniel



Bug#848049: diffoscope: Add detection of order-only differences in plain text formats

2016-12-24 Thread Маша Глухова
I believe the attached patch would provide the requested functionality.
From 0ae6d16037cc4912e5a165ee050e31e99402c912 Mon Sep 17 00:00:00 2001
From: Maria Glukhova 
Date: Sat, 24 Dec 2016 12:29:57 +0200
Subject: [PATCH] Add detection of order-only difference in plain text format.

Detect if the text files' contents differ only in line ordering, and give an appropriate comment.

Closes: #848049
---
 diffoscope/comparators/text.py  | 14 ++
 tests/comparators/test_text.py  |  8 
 tests/data/text_order1  |  7 +++
 tests/data/text_order2  |  7 +++
 tests/data/text_order_expected_diff | 11 +++
 5 files changed, 47 insertions(+)
 create mode 100644 tests/data/text_order1
 create mode 100644 tests/data/text_order2
 create mode 100644 tests/data/text_order_expected_diff

diff --git a/diffoscope/comparators/text.py b/diffoscope/comparators/text.py
index 909ff98..f7f423f 100644
--- a/diffoscope/comparators/text.py
+++ b/diffoscope/comparators/text.py
@@ -24,6 +24,17 @@ from diffoscope.difference import Difference
 from diffoscope.comparators.binary import File
 
 
+def order_only_difference(unified_diff):
+diff_lines = unified_diff.splitlines()
+added_lines = [line[1:] for line in diff_lines if line.startswith('+')]
+removed_lines = [line[1:] for line in diff_lines if line.startswith('-')]
+# Faster check: does number of lines match?
+if len(added_lines) != len(removed_lines):
+return False
+# Counter stores line and number of its occurrences.
+return sorted(added_lines) == sorted(removed_lines)
+
+
 class TextFile(File):
 RE_FILE_TYPE = re.compile(r'\btext\b')
 
@@ -44,6 +55,9 @@ class TextFile(File):
 with codecs.open(self.path, 'r', encoding=my_encoding) as my_content, \
  codecs.open(other.path, 'r', encoding=other_encoding) as other_content:
 difference = Difference.from_text_readers(my_content, other_content, self.name, other.name, source)
+# Check if difference is only in line order.
+if difference and order_only_difference(difference.unified_diff):
+difference.add_comment("ordering differences only")
 if my_encoding != other_encoding:
 if difference is None:
 difference = Difference(None, self.path, other.path, source)
diff --git a/tests/comparators/test_text.py b/tests/comparators/test_text.py
index 9892826..afa0716 100644
--- a/tests/comparators/test_text.py
+++ b/tests/comparators/test_text.py
@@ -65,3 +65,11 @@ def test_difference_between_iso88591_and_unicode_only(iso8859, tmpdir):
 
 def test_compare_non_existing(monkeypatch, ascii1):
 assert_non_existing(monkeypatch, ascii1, has_null_source=False, has_details=False)
+
+text_order1 = load_fixture(data('text_order1'))
+text_order2 = load_fixture(data('text_order2'))
+
+def test_ordering_differences(text_order1, text_order2):
+difference = text_order1.compare(text_order2)
+assert difference.comments == ['ordering differences only']
+assert difference.unified_diff == open(data('text_order_expected_diff')).read()
diff --git a/tests/data/text_order1 b/tests/data/text_order1
new file mode 100644
index 000..9f85b81
--- /dev/null
+++ b/tests/data/text_order1
@@ -0,0 +1,7 @@
+These
+lines
+follow
+in
+some
+order
+.
diff --git a/tests/data/text_order2 b/tests/data/text_order2
new file mode 100644
index 000..7890b50
--- /dev/null
+++ b/tests/data/text_order2
@@ -0,0 +1,7 @@
+These
+some
+order
+follow
+in
+lines
+.
diff --git a/tests/data/text_order_expected_diff b/tests/data/text_order_expected_diff
new file mode 100644
index 000..2d8b915
--- /dev/null
+++ b/tests/data/text_order_expected_diff
@@ -0,0 +1,11 @@
+@@ -1,7 +1,7 @@
+ These
+-lines
+-follow
+-in
+ some
+ order
++follow
++in
++lines
+ .
-- 
2.11.0