Re: [openssl-dev] Partially- vs. full- reduced inputs to ecp_nistz256_neg

2016-08-22 Thread Andy Polyakov
>>> * Fix ecp_nistz256_mul_by_2 and ecp_nistz256_mul_by_3 to fully reduce
>>> their outputs.
>>>
>>> * Fix ecp_nistz256_add to fully reduce its output.
>>
>> As for specifically addition see below. As for fixing mul_by_[23] and
>> the fact that they use addition. There are two ways. a) Modify addition
>> so that it *preserves* property of being fully reduced and leave
>> mul_by_[23] as is. b) Let addition as is and add additional step to
>> mul_by_[23]. The choice of approach can be platform-specific. For
>> example on x86_64 a) is simpler and appears more efficient.

After considering other ecp_nistz256-enabled platforms a) appears better
choice on all of them. It probably holds universally true, but I would
still mention b) in commentary...


-- 
openssl-dev mailing list
To unsubscribe: https://mta.openssl.org/mailman/listinfo/openssl-dev


Re: [openssl-dev] Partially- vs. full- reduced inputs to ecp_nistz256_neg

2016-08-20 Thread Andy Polyakov
>> * Fix ecp_nistz256_mul_by_2 and ecp_nistz256_mul_by_3 to fully reduce
>> their outputs.
>>
>> * Fix ecp_nistz256_add to fully reduce its output.
> 
> As for specifically addition see below. As for fixing mul_by_[23] and
> the fact that they use addition. There are two ways. a) Modify addition
> so that it *preserves* property of being fully reduced and leave
> mul_by_[23] as is. b) Let addition as is and add additional step to
> mul_by_[23]. The choice of approach can be platform-specific. For
> example on x86_64 a) is simpler and appears more efficient.

Consider attached diff.

diff --git a/crypto/ec/asm/ecp_nistz256-x86_64.pl b/crypto/ec/asm/ecp_nistz256-x86_64.pl
index cce92b9..cc7b976 100755
--- a/crypto/ec/asm/ecp_nistz256-x86_64.pl
+++ b/crypto/ec/asm/ecp_nistz256-x86_64.pl
@@ -135,6 +135,7 @@ ecp_nistz256_mul_by_2:
 	push	%r13
 
 	mov	8*0($a_ptr), $a0
+	xor	$t4,$t4
 	mov	8*1($a_ptr), $a1
 	add	$a0, $a0		# a0:a3+a0:a3
 	mov	8*2($a_ptr), $a2
@@ -145,7 +146,7 @@ ecp_nistz256_mul_by_2:
 	adc	$a2, $a2
 	adc	$a3, $a3
 	 mov	$a1, $t1
-	sbb	$t4, $t4
+	adc	\$0, $t4
 
 	sub	8*0($a_ptr), $a0
 	 mov	$a2, $t2
@@ -153,14 +154,14 @@ ecp_nistz256_mul_by_2:
 	sbb	8*2($a_ptr), $a2
 	 mov	$a3, $t3
 	sbb	8*3($a_ptr), $a3
-	test	$t4, $t4
+	sbb	\$0, $t4
 
-	cmovz	$t0, $a0
-	cmovz	$t1, $a1
+	cmovb	$t0, $a0
+	cmovb	$t1, $a1
 	mov	$a0, 8*0($r_ptr)
-	cmovz	$t2, $a2
+	cmovb	$t2, $a2
 	mov	$a1, 8*1($r_ptr)
-	cmovz	$t3, $a3
+	cmovb	$t3, $a3
 	mov	$a2, 8*2($r_ptr)
 	mov	$a3, 8*3($r_ptr)
 
@@ -257,12 +258,12 @@ ecp_nistz256_mul_by_3:
 	sbb	\$0, $a2
 	 mov	$a3, $t3
 	sbb	.Lpoly+8*3(%rip), $a3
-	test	$t4, $t4
+	sbb	\$0, $t4
 
-	cmovz	$t0, $a0
-	cmovz	$t1, $a1
-	cmovz	$t2, $a2
-	cmovz	$t3, $a3
+	cmovb	$t0, $a0
+	cmovb	$t1, $a1
+	cmovb	$t2, $a2
+	cmovb	$t3, $a3
 
 	xor	$t4, $t4
 	add	8*0($a_ptr), $a0	# a0:a3+=a_ptr[0:3]
@@ -279,14 +280,14 @@ ecp_nistz256_mul_by_3:
 	sbb	\$0, $a2
 	 mov	$a3, $t3
 	sbb	.Lpoly+8*3(%rip), $a3
-	test	$t4, $t4
+	sbb	\$0, $t4
 
-	cmovz	$t0, $a0
-	cmovz	$t1, $a1
+	cmovb	$t0, $a0
+	cmovb	$t1, $a1
 	mov	$a0, 8*0($r_ptr)
-	cmovz	$t2, $a2
+	cmovb	$t2, $a2
 	mov	$a1, 8*1($r_ptr)
-	cmovz	$t3, $a3
+	cmovb	$t3, $a3
 	mov	$a2, 8*2($r_ptr)
 	mov	$a3, 8*3($r_ptr)
 
@@ -325,14 +326,14 @@ ecp_nistz256_add:
 	sbb	8*2($a_ptr), $a2
 	 mov	$a3, $t3
 	sbb	8*3($a_ptr), $a3
-	test	$t4, $t4
+	sbb	\$0, $t4
 
-	cmovz	$t0, $a0
-	cmovz	$t1, $a1
+	cmovb	$t0, $a0
+	cmovb	$t1, $a1
 	mov	$a0, 8*0($r_ptr)
-	cmovz	$t2, $a2
+	cmovb	$t2, $a2
 	mov	$a1, 8*1($r_ptr)
-	cmovz	$t3, $a3
+	cmovb	$t3, $a3
 	mov	$a2, 8*2($r_ptr)
 	mov	$a3, 8*3($r_ptr)
 
@@ -1890,13 +1891,14 @@ $code.=<<___;
 .type	__ecp_nistz256_add_toq,\@abi-omnipotent
 .align	32
 __ecp_nistz256_add_toq:
+	xor	$t4,$t4
 	add	8*0($b_ptr), $a0
 	adc	8*1($b_ptr), $a1
 	 mov	$a0, $t0
 	adc	8*2($b_ptr), $a2
 	adc	8*3($b_ptr), $a3
 	 mov	$a1, $t1
-	sbb	$t4, $t4
+	adc	\$0, $t4
 
 	sub	\$-1, $a0
 	 mov	$a2, $t2
@@ -1904,14 +1906,14 @@ __ecp_nistz256_add_toq:
 	sbb	\$0, $a2
 	 mov	$a3, $t3
 	sbb	$poly3, $a3
-	test	$t4, $t4
+	sbb	\$0, $t4
 
-	cmovz	$t0, $a0
-	cmovz	$t1, $a1
+	cmovb	$t0, $a0
+	cmovb	$t1, $a1
 	mov	$a0, 8*0($r_ptr)
-	cmovz	$t2, $a2
+	cmovb	$t2, $a2
 	mov	$a1, 8*1($r_ptr)
-	cmovz	$t3, $a3
+	cmovb	$t3, $a3
 	mov	$a2, 8*2($r_ptr)
 	mov	$a3, 8*3($r_ptr)
 
@@ -1979,13 +1981,14 @@ __ecp_nistz256_subq:
 .type	__ecp_nistz256_mul_by_2q,\@abi-omnipotent
 .align	32
 __ecp_nistz256_mul_by_2q:
+	xor	$t4, $t4
 	add	$a0, $a0		# a0:a3+a0:a3
 	adc	$a1, $a1
 	 mov	$a0, $t0
 	adc	$a2, $a2
 	adc	$a3, $a3
 	 mov	$a1, $t1
-	sbb	$t4, $t4
+	adc	\$0, $t4
 
 	sub	\$-1, $a0
 	 mov	$a2, $t2
@@ -1993,14 +1996,14 @@ __ecp_nistz256_mul_by_2q:
 	sbb	\$0, $a2
 	 mov	$a3, $t3
 	sbb	$poly3, $a3
-	test	$t4, $t4
+	sbb	\$0, $t4
 
-	cmovz	$t0, $a0
-	cmovz	$t1, $a1
+	cmovb	$t0, $a0
+	cmovb	$t1, $a1
 	mov	$a0, 8*0($r_ptr)
-	cmovz	$t2, $a2
+	cmovb	$t2, $a2
 	mov	$a1, 8*1($r_ptr)
-	cmovz	$t3, $a3
+	cmovb	$t3, $a3
 	mov	$a2, 8*2($r_ptr)
 	mov	$a3, 8*3($r_ptr)
 
@@ -2455,6 +2458,7 @@ $code.=<<___;
 	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
 	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
 
+	xor	$t4, $t4
 	add	$acc0, $acc0		# a0:a3+a0:a3
 	lea	$Rsqr(%rsp), $a_ptr
 	adc	$acc1, $acc1
@@ -2462,7 +2466,7 @@ $code.=<<___;
 	adc	$acc2, $acc2
 	adc	$acc3, $acc3
 	 mov	$acc1, $t1
-	sbb	$t4, $t4
+	adc	\$0, $t4
 
 	sub	\$-1, $acc0
 	 mov	$acc2, $t2
@@ -2470,15 +2474,15 @@ $code.=<<___;
 	sbb	\$0, $acc2
 	 mov	$acc3, $t3
 	sbb	$poly3, $acc3
-	test	$t4, $t4
+	sbb	\$0, $t4
 
-	cmovz	$t0, $acc0
+	cmovb	$t0, $acc0
 	mov	8*0($a_ptr), $t0
-	cmovz	$t1, $acc1
+	cmovb	$t1, $acc1
 	mov	8*1($a_ptr), $t1
-	cmovz	$t2, $acc2
+	cmovb	$t2, $acc2
 	mov	8*2($a_ptr), $t2
-	cmovz	$t3, $acc3
+	cmovb	$t3, $acc3
 	mov	8*3($a_ptr), $t3
 
 	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
@@ -2760,6 +2764,7 @@ $code.=<<___;
 	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
 	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
 
+	xor	$t4, $t4
 	add	$acc0, $acc0		# a0:a3+a0:a3
 	lea	$Rsqr(%rsp), $a_ptr
 	adc	$acc1, $acc1
@@ 

Re: [openssl-dev] Partially- vs. full- reduced inputs to ecp_nistz256_neg

2016-08-19 Thread Andy Polyakov
>> It appears to me that with multiplication, squaring, subtraction,
>> negation, halving *preserving* property of being fully reduced (i.e. if
>> inputs are fully reduced, then output is too), we only have to watch out
>> for mul_by_[23], i.e. ensure that their outputs are fully reduced. This
>> would ensure that output will always be fully reduced.
> 
> Let me state thing in a different way, and see if this is what you
> meant: Every function will have as a prerequisite that its inputs are
> fully reduced and will have a postcondition that its output is fully
> reduced.

Assertion is that every function but addition and mul_by_[23] *has* (not
"will have", but "has") such postcondition that their outputs are fully
reduced as long as inputs are. Not to mention that multiplication (and
squaring) *can* (or should we say "is likely to") produce fully reduced
output even if inputs are not fully reduced.

> We know that ecp_nistz256_add doesn't fully reduce its output
> even if the input is fully reduced, and we know that
> ecp_nistz256_mul_by_[23] are implemented in terms of ecp_nistz256_add
> (or equivalent logic, in the case of some of the ASM stuff).
> Accordingly, the plan of action is:
> 
> * Fix ecp_nistz256_mul_by_2 and ecp_nistz256_mul_by_3 to fully reduce
> their outputs.
> 
> * Fix ecp_nistz256_add to fully reduce its output.

As for specifically addition see below. As for fixing mul_by_[23] and
the fact that they use addition. There are two ways. a) Modify addition
so that it *preserves* property of being fully reduced and leave
mul_by_[23] as is. b) Let addition as is and add additional step to
mul_by_[23]. The choice of approach can be platform-specific. For
example on x86_64 a) is simpler and appears more efficient. But on some
platforms b) could be better option.

> * Ensure in ecp_nistz256_points_mul that all the input coordinates are
> fully reduced.

I didn't mean to add any additional steps, but simply see that inputs
should be fully reduced. Simply put question is if output from
conversion to Montgomery representation is fully reduced. And then as
all involved subroutines would *preserve* the property, everything
remains fully reduced throughout the complete course.

> After all of this, we won't have to worry about the handling of
> partially-reduced values anywhere.
> 
> Is that correct? In particular, you said "we only have to watch out
> for mul_by_[23]" but you didn't mention ecp_nistz256_add, which *is*
> used directly in ecp_nistz256_point_double, according to the reference
> C implementation.

Rationale behind not explicitly mentioning addition is following
sequence from ecp_nistz256_point_double:

ecp_nistz256_add(M, in_x, Zsqr);
ecp_nistz256_mul_mont(M, M, Zsqr);
ecp_nistz256_mul_by_3(M, M);
ecp_nistz256_sqr_mont(res_x, M);

It doesn't matter if addition returns partially reduced result, as long
as mul_by_3 ties it up.

-- 
openssl-dev mailing list
To unsubscribe: https://mta.openssl.org/mailman/listinfo/openssl-dev


Re: [openssl-dev] Partially- vs. full- reduced inputs to ecp_nistz256_neg

2016-08-18 Thread Brian Smith
Andy Polyakov  wrote:
> It appears to me that with multiplication, squaring, subtraction,
> negation, halving *preserving* property of being fully reduced (i.e. if
> inputs are fully reduced, then output is too), we only have to watch out
> for mul_by_[23], i.e. ensure that their outputs are fully reduced. This
> would ensure that output will always be fully reduced.

Let me state thing in a different way, and see if this is what you
meant: Every function will have as a prerequisite that its inputs are
fully reduced and will have a postcondition that its output is fully
reduced. We know that ecp_nistz256_add doesn't fully reduce its output
even if the input is fully reduced, and we know that
ecp_nistz256_mul_by_[23] are implemented in terms of ecp_nistz256_add
(or equivalent logic, in the case of some of the ASM stuff).
Accordingly, the plan of action is:

* Fix ecp_nistz256_mul_by_2 and ecp_nistz256_mul_by_3 to fully reduce
their outputs.

* Fix ecp_nistz256_add to fully reduce its output.

* Ensure in ecp_nistz256_points_mul that all the input coordinates are
fully reduced.

After all of this, we won't have to worry about the handling of
partially-reduced values anywhere.

Is that correct? In particular, you said "we only have to watch out
for mul_by_[23]" but you didn't mention ecp_nistz256_add, which *is*
used directly in ecp_nistz256_point_double, according to the reference
C implementation.

I think that is a reasonable course of action.

I'd like to suggest the following additional steps:

* Verify with Vlad that he agrees with our assessment of the current
state of the code and the plan. In particular, I still worry that what
we agreed on here doesn't match what Vlad told me before. Maybe Vlad
was thinking about some other code he wrote that uses the Almost
Montgomery math instead of regular Montgomery math. But maybe he knows
something we are overlooking. In particular, we should ask him and
Shay whether they intentionally made ecp_nistz256_add (et al.) return
partially-reduced results because they'd proven that full reduction
isn't necessary given the subsequent statements.

* Document in the top of ecp_nistz256.c the input and output bounds
are [0, P256) for all field operations.

* Test some interesting edge cases of all the functions to verify that
they generate the correct output. (This series of bug reports started
by me doing this, starting with ecp_nistz256_add.)

* Because we can't easily test it, audit the inlined code inside the
assembly langauge implementations of ecp_nistz256_{double, add,
add_affine} to ensure that it matches the logic within the standalone
functions that we can write tests for. (As you noted, in many cases,
the standalone field arithmetic functions and the inline variants
actually share code; in some cases, however, they don't.)

> In this and RT#4621 combined context one can conclude that *as long as*
> inputs to ecp_nistz256_point_add are fully reduced, is_equal calls work
> correctly, because there are no non-full-reduction-preserving calls
> prior them. Would you agree?

If all functions have the postcondition that they always fully reduce
their outputs then I agree that the is_equal calls are correct, and
also that the ecp_nistz256_neg function seems to be correct since it
gives the expected results when given fully reduced inputs.

Cheers,
Brian
-- 
https://briansmith.org/
-- 
openssl-dev mailing list
To unsubscribe: https://mta.openssl.org/mailman/listinfo/openssl-dev


Re: [openssl-dev] Partially- vs. full- reduced inputs to ecp_nistz256_neg

2016-08-18 Thread Kurt Roeckx
On Thu, Aug 18, 2016 at 04:24:56PM +0200, Andy Polyakov wrote:
> >> I think you are assuming that ret is in the range [0, 2P), so that if
> >> you subtract P, the result would be in the range [0, P). That is the
> >> case in normal Montgomery multiplication, where the inputs are in the
> >> range [0, P). But, my understanding is that if the inputs are in the
> >> range [P, 2**256), e.g. they are the result of ecp_nistz256_add, then
> >> that assumption doesn't necessarily hold.
> > 
> > Looks like you are right. I mean it indeed appears to be possible for
> > multiplication (and squaring) subroutine to return partially reduced
> > result. But *only* if input was partially reduced. I.e. if input is
> > fully reduced, the output *shall* be too. And if input is not fully
> > reduced, then output *can* be.
> 
> It appears to me that with multiplication, squaring, subtraction,
> negation, halving *preserving* property of being fully reduced (i.e. if
> inputs are fully reduced, then output is too), we only have to watch out
> for mul_by_[23], i.e. ensure that their outputs are fully reduced. This
> would ensure that output will always be fully reduced.

Can you document some of those things?


Kurt

-- 
openssl-dev mailing list
To unsubscribe: https://mta.openssl.org/mailman/listinfo/openssl-dev


Re: [openssl-dev] Partially- vs. full- reduced inputs to ecp_nistz256_neg

2016-08-18 Thread Andy Polyakov
>> I think you are assuming that ret is in the range [0, 2P), so that if
>> you subtract P, the result would be in the range [0, P). That is the
>> case in normal Montgomery multiplication, where the inputs are in the
>> range [0, P). But, my understanding is that if the inputs are in the
>> range [P, 2**256), e.g. they are the result of ecp_nistz256_add, then
>> that assumption doesn't necessarily hold.
> 
> Looks like you are right. I mean it indeed appears to be possible for
> multiplication (and squaring) subroutine to return partially reduced
> result. But *only* if input was partially reduced. I.e. if input is
> fully reduced, the output *shall* be too. And if input is not fully
> reduced, then output *can* be.

It appears to me that with multiplication, squaring, subtraction,
negation, halving *preserving* property of being fully reduced (i.e. if
inputs are fully reduced, then output is too), we only have to watch out
for mul_by_[23], i.e. ensure that their outputs are fully reduced. This
would ensure that output will always be fully reduced.

In this and RT#4621 combined context one can conclude that *as long as*
inputs to ecp_nistz256_point_add are fully reduced, is_equal calls work
correctly, because there are no non-full-reduction-preserving calls
prior them. Would you agree?

-- 
openssl-dev mailing list
To unsubscribe: https://mta.openssl.org/mailman/listinfo/openssl-dev


Re: [openssl-dev] Partially- vs. full- reduced inputs to ecp_nistz256_neg

2016-08-17 Thread Andy Polyakov
>>> My understand after talking with Vlad that the "sbb \$0, $acc2" makes
>>> this equivalent to (r >= 2**256) ? (r - q) : r. If the "sbb \$0,
>>> $acc2" line were removed then it would be equivalent to (r >= q) ? (r
>>> - q) : r. My understanding is that the difference in semantics is
>>> exactly the difference between partially reduced results and fully
>>> reduced results.
>>
>> Let's recall that result of multiplication prior final reduction is
>> actually n+1-limb value, with +1 limb being single bit, that's $acc2,
>> 5th limb in the context. So that the $0 in last 'sbb \$0,$acc2'
>> represents 5th ("imaginary") limb of modulus[!]. And since we're looking
>> at borrow from this 5-limb subtraction, outcome is actually
>>
>> if (ret > P) ret -= P'
> 
> OK, let's agree on that.

Actually correct expression is 'if (ret >= P) ret -= P'.

> I think you are assuming that ret is in the range [0, 2P), so that if
> you subtract P, the result would be in the range [0, P). That is the
> case in normal Montgomery multiplication, where the inputs are in the
> range [0, P). But, my understanding is that if the inputs are in the
> range [P, 2**256), e.g. they are the result of ecp_nistz256_add, then
> that assumption doesn't necessarily hold.

Looks like you are right. I mean it indeed appears to be possible for
multiplication (and squaring) subroutine to return partially reduced
result. But *only* if input was partially reduced. I.e. if input is
fully reduced, the output *shall* be too. And if input is not fully
reduced, then output *can* be. And condition for "can" appears somewhat
tricky. If not fully reduced input was force-reduced and final condition
in 'if (ret >= P) ret -= P' was true, then output would be not fully
reduced.

> I understand Almost Montgomery Multiplication (AMM) as described in
> [1], where one accepts inputs in the range [0, 2**n] and returns a
> result in the range [0, 2**n), not [0, P).

And that one checks only the top bit. This means that partially reduced
output is possible even for fully reduced input.

> I also read the original paper on the ecp_nistz256 implementation [2],
> which has "0 ≤ a, b < p" as a precondition for the Montgomery
> multiplication.
> 
> To put my concern another way: Earlier in the thread, you said the
> function can take inputs that aren't fully reduced--i.e. are in in the
> range [0, 2**n)--and returns outputs that are fully reduced--i.e. in
> the range [0, P)

I was wrong, sorry about confusion.

-- 
openssl-dev mailing list
To unsubscribe: https://mta.openssl.org/mailman/listinfo/openssl-dev


Re: [openssl-dev] Partially- vs. full- reduced inputs to ecp_nistz256_neg

2016-08-16 Thread Brian Smith
Andy Polyakov  wrote:
>> My understand after talking with Vlad that the "sbb \$0, $acc2" makes
>> this equivalent to (r >= 2**256) ? (r - q) : r. If the "sbb \$0,
>> $acc2" line were removed then it would be equivalent to (r >= q) ? (r
>> - q) : r. My understanding is that the difference in semantics is
>> exactly the difference between partially reduced results and fully
>> reduced results.
>
> Let's recall that result of multiplication prior final reduction is
> actually n+1-limb value, with +1 limb being single bit, that's $acc2,
> 5th limb in the context. So that the $0 in last 'sbb \$0,$acc2'
> represents 5th ("imaginary") limb of modulus[!]. And since we're looking
> at borrow from this 5-limb subtraction, outcome is actually
>
> if (ret > P) ret -= P'

OK, let's agree on that.

I think you are assuming that ret is in the range [0, 2P), so that if
you subtract P, the result would be in the range [0, P). That is the
case in normal Montgomery multiplication, where the inputs are in the
range [0, P). But, my understanding is that if the inputs are in the
range [P, 2**256), e.g. they are the result of ecp_nistz256_add, then
that assumption doesn't necessarily hold.

I understand Almost Montgomery Multiplication (AMM) as described in
[1], where one accepts inputs in the range [0, 2**n] and returns a
result in the range [0, 2**n), not [0, P).

I also read the original paper on the ecp_nistz256 implementation [2],
which has "0 ≤ a, b < p" as a precondition for the Montgomery
multiplication.

To put my concern another way: Earlier in the thread, you said the
function can take inputs that aren't fully reduced--i.e. are in in the
range [0, 2**n)--and returns outputs that are fully reduced--i.e. in
the range [0, P)--but it isn't clear how that is achieved. My
understanding is that the domain and range of the function are the
same.

[1] https://eprint.iacr.org/2011/239.pdf
[2] https://eprint.iacr.org/2013/816.pdf

Cheers,
Brian
-- 
openssl-dev mailing list
To unsubscribe: https://mta.openssl.org/mailman/listinfo/openssl-dev


Re: [openssl-dev] Partially- vs. full- reduced inputs to ecp_nistz256_neg

2016-08-16 Thread Andy Polyakov
> Let's recall that result of multiplication prior final reduction is
> actually n+1-limb value, with +1 limb being single bit,

This came out wrong. Result is N+1 *bits* wide, it's just in this
particular case you have to spend additional limb on the the additional
bit. It's just that particular cases are most common ones, that's why
you'd tend to put it as wrong as above :-)

-- 
openssl-dev mailing list
To unsubscribe: https://mta.openssl.org/mailman/listinfo/openssl-dev


Re: [openssl-dev] Partially- vs. full- reduced inputs to ecp_nistz256_neg

2016-08-16 Thread Andy Polyakov
> ... I re-read the code for the conditional subtraction at the
> end of ecp_nistz256_mul_mont (__ecp_nistz256_mul_montq, actually) and
> I couldn't convince myself that the result was always fully reduced.
> 
> My concern is that what you say and what Vlad said is contradictory.
> You both understand the code way better than me, so I feel like I'm
> not so useful in resolving the contradiction. But, I will try anyway:
> 
> sbb $poly3, $acc1 # .Lpoly[3]
> sbb \$0, $acc2
> 
> cmovc $t0, $acc4
> cmovc $t1, $acc5
> 
> My understand after talking with Vlad that the "sbb \$0, $acc2" makes
> this equivalent to (r >= 2**256) ? (r - q) : r. If the "sbb \$0,
> $acc2" line were removed then it would be equivalent to (r >= q) ? (r
> - q) : r. My understanding is that the difference in semantics is
> exactly the difference between partially reduced results and fully
> reduced results.

Let's recall that result of multiplication prior final reduction is
actually n+1-limb value, with +1 limb being single bit, that's $acc2,
5th limb in the context. So that the $0 in last 'sbb \$0,$acc2'
represents 5th ("imaginary") limb of modulus[!]. And since we're looking
at borrow from this 5-limb subtraction, outcome is actually

if (ret > P) ret -= P'

Effectively that is. As reality is rather

temp = ret; ret -= P; if (borrow, i.e. ret < P) ret = temp

For reference, if one wanted to compare result of multiplication to
2^256 it would be sufficient to check for $acc2 being non-zero, but that
doesn't actually work.

> Another way to see this is that there are 5 sbb instructions

I assume that "5 sbb" actually means "1 sub + 4 sbb".

> issued
> for the conditional subtraction, which means 5 limbs are involved.
> But, a full reduction mod q should only have 4 sbb instructions,
> right?

If you checked for $acc2 being non-zero, i.e. compare to 2^256, chain of
four subtraction instructions would suffice, yes. But that's not what we
aim for.

-- 
openssl-dev mailing list
To unsubscribe: https://mta.openssl.org/mailman/listinfo/openssl-dev


Re: [openssl-dev] Partially- vs. full- reduced inputs to ecp_nistz256_neg

2016-08-16 Thread Brian Smith
Andy Polyakov  wrote:
> And it's not only that multiplication (and squaring) result is fully
> reduced, it, multiplication (ans squaring) subroutine can actually
> manage partially reduced input. On related note one can point out that
> result of addition (and mul_by_[2|3]) is partially reduced. But it's
> multiplication's ability to handle it that ties things up. One should
> also remember that it always ends with multiplication when result is
> converted from Montgomery representation. As well as that it starts with
> multiplication when input is converted to Montgomery representation...

Andy, thanks! Let's rewind a bit:

Originally I was writing tests and the first function I tested the
reduction for was ecp_nistz256_add.

I was surprised to find that the result is not necessarily unreduced.
I emailed the original authors from Intel (Vlad and Shay) asking if
this was expected and whether any other functions return unreduced
results. Vlad's reply was that *all* of the functions are
expected/assumed to return unreduced inputs. I then followed up asking
specifically about whether the result of the multiplication could be
only partially reduced. I received another reply that the result of
the multiplication could be only partially reduced and that some code
I'd written for BoringSSL that assumes that the multiplication result
was always fully reduced was wrong.

After that, I re-read the code for the conditional subtraction at the
end of ecp_nistz256_mul_mont (__ecp_nistz256_mul_montq, actually) and
I couldn't convince myself that the result was always fully reduced.

My concern is that what you say and what Vlad said is contradictory.
You both understand the code way better than me, so I feel like I'm
not so useful in resolving the contradiction. But, I will try anyway:

sbb $poly3, $acc1 # .Lpoly[3]
sbb \$0, $acc2

cmovc $t0, $acc4
cmovc $t1, $acc5

My understand after talking with Vlad that the "sbb \$0, $acc2" makes
this equivalent to (r >= 2**256) ? (r - q) : r. If the "sbb \$0,
$acc2" line were removed then it would be equivalent to (r >= q) ? (r
- q) : r. My understanding is that the difference in semantics is
exactly the difference between partially reduced results and fully
reduced results.

Another way to see this is that there are 5 sbb instructions issued
for the conditional subtraction, which means 5 limbs are involved.
But, a full reduction mod q should only have 4 sbb instructions,
right?

Again, I could very well be horrifically misunderstanding things.
Perhaps it would be a good idea to ask Vlad to weigh in again?

Cheers,
Brian
-- 
https://briansmith.org/
-- 
openssl-dev mailing list
To unsubscribe: https://mta.openssl.org/mailman/listinfo/openssl-dev


Re: [openssl-dev] Partially- vs. full- reduced inputs to ecp_nistz256_neg

2016-08-16 Thread Andy Polyakov
>>> No, it subtraction subroutine uses *borrow* to determine if modulus is
>>> to be added. I.e. (a >= b) ? (a - b) : (P - (b - a)). If both a and b
>>> are less than P, then result is less than P.
>>
>> Consider the case where a > P and a >= b and b is very small (e.g. 1).
>> For example, a == P + 2 and b == 1, so a >= b, and a - b == P + 2 - 1
>> == P + 1.
> 
> But assertion was "if *both* a and b are less than P". I can also tell
> that multiplication result is fully reduced.

And it's not only that multiplication (and squaring) result is fully
reduced, it, multiplication (ans squaring) subroutine can actually
manage partially reduced input. On related note one can point out that
result of addition (and mul_by_[2|3]) is partially reduced. But it's
multiplication's ability to handle it that ties things up. One should
also remember that it always ends with multiplication when result is
converted from Montgomery representation. As well as that it starts with
multiplication when input is converted to Montgomery representation...

-- 
openssl-dev mailing list
To unsubscribe: https://mta.openssl.org/mailman/listinfo/openssl-dev


Re: [openssl-dev] Partially- vs. full- reduced inputs to ecp_nistz256_neg

2016-08-15 Thread Andy Polyakov
>> No, it subtraction subroutine uses *borrow* to determine if modulus is
>> to be added. I.e. (a >= b) ? (a - b) : (P - (b - a)). If both a and b
>> are less than P, then result is less than P.
> 
> Consider the case where a > P and a >= b and b is very small (e.g. 1).
> For example, a == P + 2 and b == 1, so a >= b, and a - b == P + 2 - 1
> == P + 1.

But assertion was "if *both* a and b are less than P". I can also tell
that multiplication result is fully reduced. So that if we consider
referred snippets, one from *your* previous message, then we'll see that
if we assume that inputs to subtraction subroutines are reduced, then
result of subtraction is too. You also established that negate for fully
reduced input works correctly. And then it all boils down to initial
user input, right?

Off for today...
-- 
openssl-dev mailing list
To unsubscribe: https://mta.openssl.org/mailman/listinfo/openssl-dev


Re: [openssl-dev] Partially- vs. full- reduced inputs to ecp_nistz256_neg

2016-08-15 Thread Brian Smith
Andy Polyakov  wrote:
> No, it subtraction subroutine uses *borrow* to determine if modulus is
> to be added. I.e. (a >= b) ? (a - b) : (P - (b - a)). If both a and b
> are less than P, then result is less than P.

Consider the case where a > P and a >= b and b is very small (e.g. 1).
For example, a == P + 2 and b == 1, so a >= b, and a - b == P + 2 - 1
== P + 1.

Of course, this reduces the question of whether the multiplication
that precedes the subtraction can ever have a result in [P, 2**256 -
1).

Cheers,
Brian
-- 
https://briansmith.org/
-- 
openssl-dev mailing list
To unsubscribe: https://mta.openssl.org/mailman/listinfo/openssl-dev


Re: [openssl-dev] Partially- vs. full- reduced inputs to ecp_nistz256_neg

2016-08-15 Thread Andy Polyakov
>>> Note in particular that, IIUC, ecp_nistz256_neg will never get an
>>> unreduced input when applied to the the based point multiples, because
>>> those are already fully reduced. But, when it is used in
>>> ecp_nistz256_windowed_mul, it isn't clear whether or how the input Y
>>> coordinate is fully reduced mod P before passed to ecp_nistz256_neg.
>>
>> Is it correctly understood that concern is that input to
>> ecp_nistz256_windowed_mul, which in turn can be *user* input, would be
>> not fully reduced?
> 
> Let's assume, for the purpose of this discussion, that the coordinates
> of the user input point `p` is already reduced, or that we will reject
> it or reduce it.
> 
> Given that input point, we calculate multiples 1*p, 2*p, ... 16*p
> using ecp_nistz256_point_double and ecp_nistz256_point_add. Let's look
> at how ecp_nistz256_point_double and ecp_nistz256_point_add calculates
> the Y coordinate:
> 
> double:
> ...
> ecp_nistz256_mul_mont(S, S, M);
> ecp_nistz256_sub(res_y, S, res_y);
> 
> add:
> ...
> ecp_nistz256_mul_mont(res_y, R, res_y);
> ecp_nistz256_sub(res_y, res_y, S2);
> memcpy(r->Y, res_y, sizeof(res_y));
> 
> Now, I was told by one of the authors of the original code that all
> the ecp_nistz256_* field operations return a result in the range [0,
> 2**256), not [0, P). This can be seen in the reduction code, e.g. in
> ecp_nistz256_sub. It uses the result of "sbb \$0, $t4" to determine if
> the result is larger than 2**256 - 1. If the result is larger than
> 2**256 - 1 then it subtracts P. Otherwise, it returns the result. But,
> in the case where it doesn't subtract P, the result might be in the
> range [P, 2**256); i.e. not fully reduced.

No, it subtraction subroutine uses *borrow* to determine if modulus is
to be added. I.e. (a >= b) ? (a - b) : (P - (b - a)). If both a and b
are less than P, then result is less than P.

-- 
openssl-dev mailing list
To unsubscribe: https://mta.openssl.org/mailman/listinfo/openssl-dev


Re: [openssl-dev] Partially- vs. full- reduced inputs to ecp_nistz256_neg

2016-08-15 Thread Andy Polyakov
> C-callables are wrappers around inlined subroutines. The only thing they
> do is load input into designated registers and call inlines, those used
> in point functions.

It's true for modules other than x86_64, but not x86_64 one. Sorry about
confusion.

-- 
openssl-dev mailing list
To unsubscribe: https://mta.openssl.org/mailman/listinfo/openssl-dev


Re: [openssl-dev] Partially- vs. full- reduced inputs to ecp_nistz256_neg

2016-08-15 Thread Brian Smith
Andy Polyakov  wrote:
>> Note in particular that, IIUC, ecp_nistz256_neg will never get an
>> unreduced input when applied to the the based point multiples, because
>> those are already fully reduced. But, when it is used in
>> ecp_nistz256_windowed_mul, it isn't clear whether or how the input Y
>> coordinate is fully reduced mod P before passed to ecp_nistz256_neg.
>
> Is it correctly understood that concern is that input to
> ecp_nistz256_windowed_mul, which in turn can be *user* input, would be
> not fully reduced?

Let's assume, for the purpose of this discussion, that the coordinates
of the user input point `p` is already reduced, or that we will reject
it or reduce it.

Given that input point, we calculate multiples 1*p, 2*p, ... 16*p
using ecp_nistz256_point_double and ecp_nistz256_point_add. Let's look
at how ecp_nistz256_point_double and ecp_nistz256_point_add calculates
the Y coordinate:

double:
...
ecp_nistz256_mul_mont(S, S, M);
ecp_nistz256_sub(res_y, S, res_y);

add:
...
ecp_nistz256_mul_mont(res_y, R, res_y);
ecp_nistz256_sub(res_y, res_y, S2);
memcpy(r->Y, res_y, sizeof(res_y));

Now, I was told by one of the authors of the original code that all
the ecp_nistz256_* field operations return a result in the range [0,
2**256), not [0, P). This can be seen in the reduction code, e.g. in
ecp_nistz256_sub. It uses the result of "sbb \$0, $t4" to determine if
the result is larger than 2**256 - 1. If the result is larger than
2**256 - 1 then it subtracts P. Otherwise, it returns the result. But,
in the case where it doesn't subtract P, the result might be in the
range [P, 2**256); i.e. not fully reduced.

NOTE: I may not be understanding it correctly! It's a very real
possibility. But, my understanding seems to match what I was told,
which is that the results of all the field operations are not fully
reduced.

Cheers,
Brian
-- 
openssl-dev mailing list
To unsubscribe: https://mta.openssl.org/mailman/listinfo/openssl-dev


[openssl-dev] Partially- vs. full- reduced inputs to ecp_nistz256_neg

2016-08-09 Thread Brian Smith
I was curious about the function ecp_nistz256_neg. This function seems
to work exactly how I expect for reduced inputs; i.e. inputs in the
range [0, P). And, it also seems to work how I expect for P:
ecp_nistz256_neg(P) == ecp_nistz256_neg(0) == 0. So, everything seems
fine for inputs in the range [0, P].

But, I don't understand how it works for the value P + 1. I expect
that one of the following is true:

ecp_nistz256_neg(P + 1) == ecp_nistz256_neg(1)
ecp_nistz256_neg(P + 1) == ecp_nistz256_neg(1) + P.

But, instead, ecp_nistz256_neg(P + 1) returns a result of 2**256 - 1. That is:

input = 00010001
expected = 0001fffe
actual: 

Similarly, I decided to test ecp_nistz256_neg(2**256 - 1). I expected
that one of the following would be true:

ecp_nistz256_neg(2**256 - 1) = ecp_nistz256_neg(2**256 - 1 - P)
ecp_nistz256_neg(2**256 - 1) == ecp_nistz256_neg(2**256 - 1 - P) + P.

But, instead, ecp_nistz256_neg(2**256 - 1) == 1. That is:

input = 0001fffe
expected = 
actual: 01

Based on these two results, then, it seems that when the input is in
the range [P + 1, 2**256 - 1], the result is actually the negation
taken modulo 2**256, instead of the negation modulo p256 (perhaps not
fully reduced).

I don't know if this is actually problematic, but it was surprising to me.

It seems to me that it would be a good idea for ecp_nistz256_neg to
first reduce its input modulo P (i.e. do a conditional subtraction of
P) before it does its calculation. At least, this would make it clear
that it is correct.

Note in particular that, IIUC, ecp_nistz256_neg will never get an
unreduced input when applied to the the based point multiples, because
those are already fully reduced. But, when it is used in
ecp_nistz256_windowed_mul, it isn't clear whether or how the input Y
coordinate is fully reduced mod P before passed to ecp_nistz256_neg.

More generally, I'm think it might be a good idea to unit test all of
the primitive operations in ecp_nistz256, with particular emphasis
placed on whether unreduced inputs are supposed to be accepted for
certain functions and, if so, whether unreduced inputs are handled
correctly.

And also, since many of the ecp_nistz256 field arithmetic functions
are inlined into the ecp_nistz256_point functions, I think it would be
worthwhile to review that the inlined versions of those functions
actually are operating in the same way as the analogous standalone
(C-callable) ecp_nistz256_* functions.

Sorry if I'm overlooking something fundamental, which I admit is
likely. Any help is appreciated.

Cheers,
Brian
--
https://briansmith.org/
-- 
openssl-dev mailing list
To unsubscribe: https://mta.openssl.org/mailman/listinfo/openssl-dev