Re: [RFC v2 05/83] Add NOVA filesystem definitions and useful helper routines.

2018-03-19 Thread Andiry Xu
On Mon, Mar 19, 2018 at 1:30 PM, Eric Biggers  wrote:
> On Mon, Mar 19, 2018 at 12:39:55PM -0700, Andiry Xu wrote:
>> On Sun, Mar 11, 2018 at 12:22 PM, Eric Biggers  wrote:
>> > On Sun, Mar 11, 2018 at 02:00:13PM +0200, Nikolay Borisov wrote:
>> >> [Adding Herbert Xu to CC since he is the maintainer of the crypto subsys
>> >> maintainer]
>> >>
>> >> On 10.03.2018 20:17, Andiry Xu wrote:
>> >> 
>> >>
>> >> > +static inline u32 nova_crc32c(u32 crc, const u8 *data, size_t len)
>> >> > +{
>> >> > +   u8 *ptr = (u8 *) data;
>> >> > +   u64 acc = crc; /* accumulator, crc32c value in lower 32b */
>> >> > +   u32 csum;
>> >> > +
>> >> > +   /* x86 instruction crc32 is part of SSE-4.2 */
>> >> > +   if (static_cpu_has(X86_FEATURE_XMM4_2)) {
>> >> > +   /* This inline assembly implementation should be equivalent
>> >> > +* to the kernel's crc32c_intel_le_hw() function used by
>> >> > +* crc32c(), but this performs better on test machines.
>> >> > +*/
>> >> > +   while (len > 8) {
>> >> > +   asm volatile(/* 64b quad words */
>> >> > +   "crc32q (%1), %0"
>> >> > +   : "=r" (acc)
>> >> > +   : "r"  (ptr), "0" (acc)
>> >> > +   );
>> >> > +   ptr += 8;
>> >> > +   len -= 8;
>> >> > +   }
>> >> > +
>> >> > +   while (len > 0) {
>> >> > +   asm volatile(/* trailing bytes */
>> >> > +   "crc32b (%1), %0"
>> >> > +   : "=r" (acc)
>> >> > +   : "r"  (ptr), "0" (acc)
>> >> > +   );
>> >> > +   ptr++;
>> >> > +   len--;
>> >> > +   }
>> >> > +
>> >> > +   csum = (u32) acc;
>> >> > +   } else {
>> >> > +   /* The kernel's crc32c() function should also detect and 
>> >> > use the
>> >> > +* crc32 instruction of SSE-4.2. But calling in to this 
>> >> > function
>> >> > +* is about 3x to 5x slower than the inline assembly 
>> >> > version on
>> >> > +* some test machines.
>> >>
>> >> That is really odd. Did you try to characterize why this is the case? Is
>> >> it purely the overhead of dispatching to the correct backend function?
>> >> That's a rather big performance hit.
>> >>
>> >> > +*/
>> >> > +   csum = crc32c(crc, data, len);
>> >> > +   }
>> >> > +
>> >> > +   return csum;
>> >> > +}
>> >> > +
>> >
>> > Are you sure that CONFIG_CRYPTO_CRC32C_INTEL was enabled during your tests 
>> > and
>> > that the accelerated version was being called?  Or, perhaps 
>> > CRC32C_PCL_BREAKEVEN
>> > (defined in arch/x86/crypto/crc32c-intel_glue.c) needs to be adjusted.  
>> > Please
>> > don't hack around performance problems like this; if they exist, they need 
>> > to be
>> > fixed for everyone.
>> >
>>
>> I have performed the crc32c test on a Xeon X5647 at 2.93GHz, 14G DDR3
>> memory at 1066MHz platform.
>> You are right that enabling CONFIG_CRYPTO_CRC32C_INTEL improves the
>> performance significantly. nova_crc32c() is still slightly faster than
>> crc32c() with the flag enabled.
>>
>> Result numbers are follows: data size in bytes, latency in ns, column
>> 3 is crc32c() with  CONFIG_CRYPTO_CRC32C_INTEL enabled and column 4
>> disabled.
>>
>> data size (bytes)   nova_crc32c()   crc32c() -enabled   crc32c() -disabled
>> 64                  19              21                  56
>> 128                 28              29                  99
>> 256                 46              43                  182
>> 512                 82              149                 354
>> 1024                157             232                 728
>> 2048                305             415                 1440
>> 4096                603             725                 2869
>>
>
> Probably CRC32C_PCL_BREAKEVEN needs to be adjusted for that CPU, as I 
> suggested
> may be the case; notice that your measured speeds are about the same before 
> 512
> (CRC32C_PCL_BREAKEVEN) bytes, but the crypto API version is slower at >= 512
> bytes.   It would be possible to set the breakeven point in
> crc32c_intel_mod_init() depending on the CPU.  Again, if the performance is 
> not
> good enough you need to fix it for everyone, not hack around it.
>

We verified that, with CRC32C_PCL_BREAKEVEN set to 8192, the
performance difference between nova_crc32c() and the kernel's crc32c()
is negligible. Thanks for the comments; I will use the kernel's
crc32c() in the next version.

Thanks,
Andiry


Re: [RFC v2 05/83] Add NOVA filesystem definitions and useful helper routines.

2018-03-19 Thread Andiry Xu
On Mon, Mar 19, 2018 at 1:30 PM, Eric Biggers  wrote:
> On Mon, Mar 19, 2018 at 12:39:55PM -0700, Andiry Xu wrote:
>> On Sun, Mar 11, 2018 at 12:22 PM, Eric Biggers  wrote:
>> > On Sun, Mar 11, 2018 at 02:00:13PM +0200, Nikolay Borisov wrote:
>> >> [Adding Herbert Xu to CC since he is the maintainer of the crypto subsys
>> >> maintainer]
>> >>
>> >> On 10.03.2018 20:17, Andiry Xu wrote:
>> >> 
>> >>
>> >> > +static inline u32 nova_crc32c(u32 crc, const u8 *data, size_t len)
>> >> > +{
>> >> > +   u8 *ptr = (u8 *) data;
>> >> > +   u64 acc = crc; /* accumulator, crc32c value in lower 32b */
>> >> > +   u32 csum;
>> >> > +
>> >> > +   /* x86 instruction crc32 is part of SSE-4.2 */
>> >> > +   if (static_cpu_has(X86_FEATURE_XMM4_2)) {
>> >> > +   /* This inline assembly implementation should be equivalent
>> >> > +* to the kernel's crc32c_intel_le_hw() function used by
>> >> > +* crc32c(), but this performs better on test machines.
>> >> > +*/
>> >> > +   while (len > 8) {
>> >> > +   asm volatile(/* 64b quad words */
>> >> > +   "crc32q (%1), %0"
>> >> > +   : "=r" (acc)
>> >> > +   : "r"  (ptr), "0" (acc)
>> >> > +   );
>> >> > +   ptr += 8;
>> >> > +   len -= 8;
>> >> > +   }
>> >> > +
>> >> > +   while (len > 0) {
>> >> > +   asm volatile(/* trailing bytes */
>> >> > +   "crc32b (%1), %0"
>> >> > +   : "=r" (acc)
>> >> > +   : "r"  (ptr), "0" (acc)
>> >> > +   );
>> >> > +   ptr++;
>> >> > +   len--;
>> >> > +   }
>> >> > +
>> >> > +   csum = (u32) acc;
>> >> > +   } else {
>> >> > +   /* The kernel's crc32c() function should also detect and 
>> >> > use the
>> >> > +* crc32 instruction of SSE-4.2. But calling in to this 
>> >> > function
>> >> > +* is about 3x to 5x slower than the inline assembly 
>> >> > version on
>> >> > +* some test machines.
>> >>
>> >> That is really odd. Did you try to characterize why this is the case? Is
>> >> it purely the overhead of dispatching to the correct backend function?
>> >> That's a rather big performance hit.
>> >>
>> >> > +*/
>> >> > +   csum = crc32c(crc, data, len);
>> >> > +   }
>> >> > +
>> >> > +   return csum;
>> >> > +}
>> >> > +
>> >
>> > Are you sure that CONFIG_CRYPTO_CRC32C_INTEL was enabled during your tests 
>> > and
>> > that the accelerated version was being called?  Or, perhaps 
>> > CRC32C_PCL_BREAKEVEN
>> > (defined in arch/x86/crypto/crc32c-intel_glue.c) needs to be adjusted.  
>> > Please
>> > don't hack around performance problems like this; if they exist, they need 
>> > to be
>> > fixed for everyone.
>> >
>>
>> I have performed the crc32c test on a Xeon X5647 at 2.93GHz, 14G DDR3
>> memory at 1066MHz platform.
>> You are right that enabling CONFIG_CRYPTO_CRC32C_INTEL improves the
>> performance significantly. nova_crc32c() is still slightly faster than
>> crc32c() with the flag enabled.
>>
>> Result numbers are follows: data size in bytes, latency in ns, column
>> 3 is crc32c() with  CONFIG_CRYPTO_CRC32C_INTEL enabled and column 4
>> disabled.
>>
>> data size (bytes)   nova_crc32c()   crc32c() -enabled   crc32c() -disabled
>> 64                  19              21                  56
>> 128                 28              29                  99
>> 256                 46              43                  182
>> 512                 82              149                 354
>> 1024                157             232                 728
>> 2048                305             415                 1440
>> 4096                603             725                 2869
>>
>
> Probably CRC32C_PCL_BREAKEVEN needs to be adjusted for that CPU, as I 
> suggested
> may be the case; notice that your measured speeds are about the same before 
> 512
> (CRC32C_PCL_BREAKEVEN) bytes, but the crypto API version is slower at >= 512
> bytes.   It would be possible to set the breakeven point in
> crc32c_intel_mod_init() depending on the CPU.  Again, if the performance is 
> not
> good enough you need to fix it for everyone, not hack around it.
>

We verified that, with CRC32C_PCL_BREAKEVEN set to 8192, the
performance difference between nova_crc32c() and the kernel's crc32c()
is negligible. Thanks for the comments; I will use the kernel's
crc32c() in the next version.

Thanks,
Andiry


Re: [RFC v2 05/83] Add NOVA filesystem definitions and useful helper routines.

2018-03-19 Thread Eric Biggers
On Mon, Mar 19, 2018 at 12:39:55PM -0700, Andiry Xu wrote:
> On Sun, Mar 11, 2018 at 12:22 PM, Eric Biggers  wrote:
> > On Sun, Mar 11, 2018 at 02:00:13PM +0200, Nikolay Borisov wrote:
> >> [Adding Herbert Xu to CC since he is the maintainer of the crypto subsys
> >> maintainer]
> >>
> >> On 10.03.2018 20:17, Andiry Xu wrote:
> >> 
> >>
> >> > +static inline u32 nova_crc32c(u32 crc, const u8 *data, size_t len)
> >> > +{
> >> > +   u8 *ptr = (u8 *) data;
> >> > +   u64 acc = crc; /* accumulator, crc32c value in lower 32b */
> >> > +   u32 csum;
> >> > +
> >> > +   /* x86 instruction crc32 is part of SSE-4.2 */
> >> > +   if (static_cpu_has(X86_FEATURE_XMM4_2)) {
> >> > +   /* This inline assembly implementation should be equivalent
> >> > +* to the kernel's crc32c_intel_le_hw() function used by
> >> > +* crc32c(), but this performs better on test machines.
> >> > +*/
> >> > +   while (len > 8) {
> >> > +   asm volatile(/* 64b quad words */
> >> > +   "crc32q (%1), %0"
> >> > +   : "=r" (acc)
> >> > +   : "r"  (ptr), "0" (acc)
> >> > +   );
> >> > +   ptr += 8;
> >> > +   len -= 8;
> >> > +   }
> >> > +
> >> > +   while (len > 0) {
> >> > +   asm volatile(/* trailing bytes */
> >> > +   "crc32b (%1), %0"
> >> > +   : "=r" (acc)
> >> > +   : "r"  (ptr), "0" (acc)
> >> > +   );
> >> > +   ptr++;
> >> > +   len--;
> >> > +   }
> >> > +
> >> > +   csum = (u32) acc;
> >> > +   } else {
> >> > +   /* The kernel's crc32c() function should also detect and use 
> >> > the
> >> > +* crc32 instruction of SSE-4.2. But calling in to this 
> >> > function
> >> > +* is about 3x to 5x slower than the inline assembly version 
> >> > on
> >> > +* some test machines.
> >>
> >> That is really odd. Did you try to characterize why this is the case? Is
> >> it purely the overhead of dispatching to the correct backend function?
> >> That's a rather big performance hit.
> >>
> >> > +*/
> >> > +   csum = crc32c(crc, data, len);
> >> > +   }
> >> > +
> >> > +   return csum;
> >> > +}
> >> > +
> >
> > Are you sure that CONFIG_CRYPTO_CRC32C_INTEL was enabled during your tests 
> > and
> > that the accelerated version was being called?  Or, perhaps 
> > CRC32C_PCL_BREAKEVEN
> > (defined in arch/x86/crypto/crc32c-intel_glue.c) needs to be adjusted.  
> > Please
> > don't hack around performance problems like this; if they exist, they need 
> > to be
> > fixed for everyone.
> >
> 
> I have performed the crc32c test on a Xeon X5647 at 2.93GHz, 14G DDR3
> memory at 1066MHz platform.
> You are right that enabling CONFIG_CRYPTO_CRC32C_INTEL improves the
> performance significantly. nova_crc32c() is still slightly faster than
> crc32c() with the flag enabled.
> 
> Result numbers are follows: data size in bytes, latency in ns, column
> 3 is crc32c() with  CONFIG_CRYPTO_CRC32C_INTEL enabled and column 4
> disabled.
> 
> data size (bytes)   nova_crc32c()   crc32c() -enabled   crc32c() -disabled
> 64                  19              21                  56
> 128                 28              29                  99
> 256                 46              43                  182
> 512                 82              149                 354
> 1024                157             232                 728
> 2048                305             415                 1440
> 4096                603             725                 2869
> 

Probably CRC32C_PCL_BREAKEVEN needs to be adjusted for that CPU, as I suggested
may be the case; notice that your measured speeds are about the same before 512
(CRC32C_PCL_BREAKEVEN) bytes, but the crypto API version is slower at >= 512
bytes.   It would be possible to set the breakeven point in
crc32c_intel_mod_init() depending on the CPU.  Again, if the performance is not
good enough you need to fix it for everyone, not hack around it.

Thanks,

Eric


Re: [RFC v2 05/83] Add NOVA filesystem definitions and useful helper routines.

2018-03-19 Thread Eric Biggers
On Mon, Mar 19, 2018 at 12:39:55PM -0700, Andiry Xu wrote:
> On Sun, Mar 11, 2018 at 12:22 PM, Eric Biggers  wrote:
> > On Sun, Mar 11, 2018 at 02:00:13PM +0200, Nikolay Borisov wrote:
> >> [Adding Herbert Xu to CC since he is the maintainer of the crypto subsys
> >> maintainer]
> >>
> >> On 10.03.2018 20:17, Andiry Xu wrote:
> >> 
> >>
> >> > +static inline u32 nova_crc32c(u32 crc, const u8 *data, size_t len)
> >> > +{
> >> > +   u8 *ptr = (u8 *) data;
> >> > +   u64 acc = crc; /* accumulator, crc32c value in lower 32b */
> >> > +   u32 csum;
> >> > +
> >> > +   /* x86 instruction crc32 is part of SSE-4.2 */
> >> > +   if (static_cpu_has(X86_FEATURE_XMM4_2)) {
> >> > +   /* This inline assembly implementation should be equivalent
> >> > +* to the kernel's crc32c_intel_le_hw() function used by
> >> > +* crc32c(), but this performs better on test machines.
> >> > +*/
> >> > +   while (len > 8) {
> >> > +   asm volatile(/* 64b quad words */
> >> > +   "crc32q (%1), %0"
> >> > +   : "=r" (acc)
> >> > +   : "r"  (ptr), "0" (acc)
> >> > +   );
> >> > +   ptr += 8;
> >> > +   len -= 8;
> >> > +   }
> >> > +
> >> > +   while (len > 0) {
> >> > +   asm volatile(/* trailing bytes */
> >> > +   "crc32b (%1), %0"
> >> > +   : "=r" (acc)
> >> > +   : "r"  (ptr), "0" (acc)
> >> > +   );
> >> > +   ptr++;
> >> > +   len--;
> >> > +   }
> >> > +
> >> > +   csum = (u32) acc;
> >> > +   } else {
> >> > +   /* The kernel's crc32c() function should also detect and use 
> >> > the
> >> > +* crc32 instruction of SSE-4.2. But calling in to this 
> >> > function
> >> > +* is about 3x to 5x slower than the inline assembly version 
> >> > on
> >> > +* some test machines.
> >>
> >> That is really odd. Did you try to characterize why this is the case? Is
> >> it purely the overhead of dispatching to the correct backend function?
> >> That's a rather big performance hit.
> >>
> >> > +*/
> >> > +   csum = crc32c(crc, data, len);
> >> > +   }
> >> > +
> >> > +   return csum;
> >> > +}
> >> > +
> >
> > Are you sure that CONFIG_CRYPTO_CRC32C_INTEL was enabled during your tests 
> > and
> > that the accelerated version was being called?  Or, perhaps 
> > CRC32C_PCL_BREAKEVEN
> > (defined in arch/x86/crypto/crc32c-intel_glue.c) needs to be adjusted.  
> > Please
> > don't hack around performance problems like this; if they exist, they need 
> > to be
> > fixed for everyone.
> >
> 
> I have performed the crc32c test on a Xeon X5647 at 2.93GHz, 14G DDR3
> memory at 1066MHz platform.
> You are right that enabling CONFIG_CRYPTO_CRC32C_INTEL improves the
> performance significantly. nova_crc32c() is still slightly faster than
> crc32c() with the flag enabled.
> 
> Result numbers are follows: data size in bytes, latency in ns, column
> 3 is crc32c() with  CONFIG_CRYPTO_CRC32C_INTEL enabled and column 4
> disabled.
> 
> data size (bytes)   nova_crc32c()   crc32c() -enabled   crc32c() -disabled
> 64                  19              21                  56
> 128                 28              29                  99
> 256                 46              43                  182
> 512                 82              149                 354
> 1024                157             232                 728
> 2048                305             415                 1440
> 4096                603             725                 2869
> 

Probably CRC32C_PCL_BREAKEVEN needs to be adjusted for that CPU, as I suggested
may be the case; notice that your measured speeds are about the same before 512
(CRC32C_PCL_BREAKEVEN) bytes, but the crypto API version is slower at >= 512
bytes.   It would be possible to set the breakeven point in
crc32c_intel_mod_init() depending on the CPU.  Again, if the performance is not
good enough you need to fix it for everyone, not hack around it.

Thanks,

Eric


Re: [RFC v2 05/83] Add NOVA filesystem definitions and useful helper routines.

2018-03-19 Thread Andiry Xu
On Sun, Mar 11, 2018 at 12:22 PM, Eric Biggers  wrote:
> On Sun, Mar 11, 2018 at 02:00:13PM +0200, Nikolay Borisov wrote:
>> [Adding Herbert Xu to CC since he is the maintainer of the crypto subsys
>> maintainer]
>>
>> On 10.03.2018 20:17, Andiry Xu wrote:
>> 
>>
>> > +static inline u32 nova_crc32c(u32 crc, const u8 *data, size_t len)
>> > +{
>> > +   u8 *ptr = (u8 *) data;
>> > +   u64 acc = crc; /* accumulator, crc32c value in lower 32b */
>> > +   u32 csum;
>> > +
>> > +   /* x86 instruction crc32 is part of SSE-4.2 */
>> > +   if (static_cpu_has(X86_FEATURE_XMM4_2)) {
>> > +   /* This inline assembly implementation should be equivalent
>> > +* to the kernel's crc32c_intel_le_hw() function used by
>> > +* crc32c(), but this performs better on test machines.
>> > +*/
>> > +   while (len > 8) {
>> > +   asm volatile(/* 64b quad words */
>> > +   "crc32q (%1), %0"
>> > +   : "=r" (acc)
>> > +   : "r"  (ptr), "0" (acc)
>> > +   );
>> > +   ptr += 8;
>> > +   len -= 8;
>> > +   }
>> > +
>> > +   while (len > 0) {
>> > +   asm volatile(/* trailing bytes */
>> > +   "crc32b (%1), %0"
>> > +   : "=r" (acc)
>> > +   : "r"  (ptr), "0" (acc)
>> > +   );
>> > +   ptr++;
>> > +   len--;
>> > +   }
>> > +
>> > +   csum = (u32) acc;
>> > +   } else {
>> > +   /* The kernel's crc32c() function should also detect and use 
>> > the
>> > +* crc32 instruction of SSE-4.2. But calling in to this 
>> > function
>> > +* is about 3x to 5x slower than the inline assembly version on
>> > +* some test machines.
>>
>> That is really odd. Did you try to characterize why this is the case? Is
>> it purely the overhead of dispatching to the correct backend function?
>> That's a rather big performance hit.
>>
>> > +*/
>> > +   csum = crc32c(crc, data, len);
>> > +   }
>> > +
>> > +   return csum;
>> > +}
>> > +
>
> Are you sure that CONFIG_CRYPTO_CRC32C_INTEL was enabled during your tests and
> that the accelerated version was being called?  Or, perhaps 
> CRC32C_PCL_BREAKEVEN
> (defined in arch/x86/crypto/crc32c-intel_glue.c) needs to be adjusted.  Please
> don't hack around performance problems like this; if they exist, they need to 
> be
> fixed for everyone.
>

I have performed the crc32c test on a Xeon X5647 at 2.93GHz, 14G DDR3
memory at 1066MHz platform.
You are right that enabling CONFIG_CRYPTO_CRC32C_INTEL improves the
performance significantly. nova_crc32c() is still slightly faster than
crc32c() with the flag enabled.

Result numbers are as follows: data size in bytes, latency in ns. Column
3 is crc32c() with CONFIG_CRYPTO_CRC32C_INTEL enabled and column 4 is
crc32c() with it disabled.

data size (bytes)   nova_crc32c()   crc32c() -enabled   crc32c() -disabled
64                  19              21                  56
128                 28              29                  99
256                 46              43                  182
512                 82              149                 354
1024                157             232                 728
2048                305             415                 1440
4096                603             725                 2869

Thanks,
Andiry


Re: [RFC v2 05/83] Add NOVA filesystem definitions and useful helper routines.

2018-03-19 Thread Andiry Xu
On Sun, Mar 11, 2018 at 12:22 PM, Eric Biggers  wrote:
> On Sun, Mar 11, 2018 at 02:00:13PM +0200, Nikolay Borisov wrote:
>> [Adding Herbert Xu to CC since he is the maintainer of the crypto subsys
>> maintainer]
>>
>> On 10.03.2018 20:17, Andiry Xu wrote:
>> 
>>
>> > +static inline u32 nova_crc32c(u32 crc, const u8 *data, size_t len)
>> > +{
>> > +   u8 *ptr = (u8 *) data;
>> > +   u64 acc = crc; /* accumulator, crc32c value in lower 32b */
>> > +   u32 csum;
>> > +
>> > +   /* x86 instruction crc32 is part of SSE-4.2 */
>> > +   if (static_cpu_has(X86_FEATURE_XMM4_2)) {
>> > +   /* This inline assembly implementation should be equivalent
>> > +* to the kernel's crc32c_intel_le_hw() function used by
>> > +* crc32c(), but this performs better on test machines.
>> > +*/
>> > +   while (len > 8) {
>> > +   asm volatile(/* 64b quad words */
>> > +   "crc32q (%1), %0"
>> > +   : "=r" (acc)
>> > +   : "r"  (ptr), "0" (acc)
>> > +   );
>> > +   ptr += 8;
>> > +   len -= 8;
>> > +   }
>> > +
>> > +   while (len > 0) {
>> > +   asm volatile(/* trailing bytes */
>> > +   "crc32b (%1), %0"
>> > +   : "=r" (acc)
>> > +   : "r"  (ptr), "0" (acc)
>> > +   );
>> > +   ptr++;
>> > +   len--;
>> > +   }
>> > +
>> > +   csum = (u32) acc;
>> > +   } else {
>> > +   /* The kernel's crc32c() function should also detect and use 
>> > the
>> > +* crc32 instruction of SSE-4.2. But calling in to this 
>> > function
>> > +* is about 3x to 5x slower than the inline assembly version on
>> > +* some test machines.
>>
>> That is really odd. Did you try to characterize why this is the case? Is
>> it purely the overhead of dispatching to the correct backend function?
>> That's a rather big performance hit.
>>
>> > +*/
>> > +   csum = crc32c(crc, data, len);
>> > +   }
>> > +
>> > +   return csum;
>> > +}
>> > +
>
> Are you sure that CONFIG_CRYPTO_CRC32C_INTEL was enabled during your tests and
> that the accelerated version was being called?  Or, perhaps 
> CRC32C_PCL_BREAKEVEN
> (defined in arch/x86/crypto/crc32c-intel_glue.c) needs to be adjusted.  Please
> don't hack around performance problems like this; if they exist, they need to 
> be
> fixed for everyone.
>

I have performed the crc32c test on a Xeon X5647 at 2.93GHz, 14G DDR3
memory at 1066MHz platform.
You are right that enabling CONFIG_CRYPTO_CRC32C_INTEL improves the
performance significantly. nova_crc32c() is still slightly faster than
crc32c() with the flag enabled.

Result numbers are as follows: data size in bytes, latency in ns. Column
3 is crc32c() with CONFIG_CRYPTO_CRC32C_INTEL enabled and column 4 is
crc32c() with it disabled.

data size (bytes)   nova_crc32c()   crc32c() -enabled   crc32c() -disabled
64                  19              21                  56
128                 28              29                  99
256                 46              43                  182
512                 82              149                 354
1024                157             232                 728
2048                305             415                 1440
4096                603             725                 2869

Thanks,
Andiry


Re: [RFC v2 05/83] Add NOVA filesystem definitions and useful helper routines.

2018-03-11 Thread Andiry Xu
On Sun, Mar 11, 2018 at 12:22 PM, Eric Biggers  wrote:
> On Sun, Mar 11, 2018 at 02:00:13PM +0200, Nikolay Borisov wrote:
>> [Adding Herbert Xu to CC since he is the maintainer of the crypto subsys
>> maintainer]
>>
>> On 10.03.2018 20:17, Andiry Xu wrote:
>> 
>>
>> > +static inline u32 nova_crc32c(u32 crc, const u8 *data, size_t len)
>> > +{
>> > +   u8 *ptr = (u8 *) data;
>> > +   u64 acc = crc; /* accumulator, crc32c value in lower 32b */
>> > +   u32 csum;
>> > +
>> > +   /* x86 instruction crc32 is part of SSE-4.2 */
>> > +   if (static_cpu_has(X86_FEATURE_XMM4_2)) {
>> > +   /* This inline assembly implementation should be equivalent
>> > +* to the kernel's crc32c_intel_le_hw() function used by
>> > +* crc32c(), but this performs better on test machines.
>> > +*/
>> > +   while (len > 8) {
>> > +   asm volatile(/* 64b quad words */
>> > +   "crc32q (%1), %0"
>> > +   : "=r" (acc)
>> > +   : "r"  (ptr), "0" (acc)
>> > +   );
>> > +   ptr += 8;
>> > +   len -= 8;
>> > +   }
>> > +
>> > +   while (len > 0) {
>> > +   asm volatile(/* trailing bytes */
>> > +   "crc32b (%1), %0"
>> > +   : "=r" (acc)
>> > +   : "r"  (ptr), "0" (acc)
>> > +   );
>> > +   ptr++;
>> > +   len--;
>> > +   }
>> > +
>> > +   csum = (u32) acc;
>> > +   } else {
>> > +   /* The kernel's crc32c() function should also detect and use 
>> > the
>> > +* crc32 instruction of SSE-4.2. But calling in to this 
>> > function
>> > +* is about 3x to 5x slower than the inline assembly version on
>> > +* some test machines.
>>
>> That is really odd. Did you try to characterize why this is the case? Is
>> it purely the overhead of dispatching to the correct backend function?
>> That's a rather big performance hit.
>>
>> > +*/
>> > +   csum = crc32c(crc, data, len);
>> > +   }
>> > +
>> > +   return csum;
>> > +}
>> > +
>
> Are you sure that CONFIG_CRYPTO_CRC32C_INTEL was enabled during your tests and
> that the accelerated version was being called?  Or, perhaps 
> CRC32C_PCL_BREAKEVEN
> (defined in arch/x86/crypto/crc32c-intel_glue.c) needs to be adjusted.  Please
> don't hack around performance problems like this; if they exist, they need to 
> be
> fixed for everyone.
>

I think we found the issue when implementing the NOVA-Fortis metadata
and data protections, which use crc32c a lot. They have been removed
from this patchset, but I will double-check whether the issue still
exists.

Thanks,
Andiry

> Eric


Re: [RFC v2 05/83] Add NOVA filesystem definitions and useful helper routines.

2018-03-11 Thread Andiry Xu
On Sun, Mar 11, 2018 at 12:22 PM, Eric Biggers  wrote:
> On Sun, Mar 11, 2018 at 02:00:13PM +0200, Nikolay Borisov wrote:
>> [Adding Herbert Xu to CC since he is the maintainer of the crypto subsys
>> maintainer]
>>
>> On 10.03.2018 20:17, Andiry Xu wrote:
>> 
>>
>> > +static inline u32 nova_crc32c(u32 crc, const u8 *data, size_t len)
>> > +{
>> > +   u8 *ptr = (u8 *) data;
>> > +   u64 acc = crc; /* accumulator, crc32c value in lower 32b */
>> > +   u32 csum;
>> > +
>> > +   /* x86 instruction crc32 is part of SSE-4.2 */
>> > +   if (static_cpu_has(X86_FEATURE_XMM4_2)) {
>> > +   /* This inline assembly implementation should be equivalent
>> > +* to the kernel's crc32c_intel_le_hw() function used by
>> > +* crc32c(), but this performs better on test machines.
>> > +*/
>> > +   while (len > 8) {
>> > +   asm volatile(/* 64b quad words */
>> > +   "crc32q (%1), %0"
>> > +   : "=r" (acc)
>> > +   : "r"  (ptr), "0" (acc)
>> > +   );
>> > +   ptr += 8;
>> > +   len -= 8;
>> > +   }
>> > +
>> > +   while (len > 0) {
>> > +   asm volatile(/* trailing bytes */
>> > +   "crc32b (%1), %0"
>> > +   : "=r" (acc)
>> > +   : "r"  (ptr), "0" (acc)
>> > +   );
>> > +   ptr++;
>> > +   len--;
>> > +   }
>> > +
>> > +   csum = (u32) acc;
>> > +   } else {
>> > +   /* The kernel's crc32c() function should also detect and use 
>> > the
>> > +* crc32 instruction of SSE-4.2. But calling in to this 
>> > function
>> > +* is about 3x to 5x slower than the inline assembly version on
>> > +* some test machines.
>>
>> That is really odd. Did you try to characterize why this is the case? Is
>> it purely the overhead of dispatching to the correct backend function?
>> That's a rather big performance hit.
>>
>> > +*/
>> > +   csum = crc32c(crc, data, len);
>> > +   }
>> > +
>> > +   return csum;
>> > +}
>> > +
>
> Are you sure that CONFIG_CRYPTO_CRC32C_INTEL was enabled during your tests and
> that the accelerated version was being called?  Or, perhaps 
> CRC32C_PCL_BREAKEVEN
> (defined in arch/x86/crypto/crc32c-intel_glue.c) needs to be adjusted.  Please
> don't hack around performance problems like this; if they exist, they need to 
> be
> fixed for everyone.
>

I think we found the issue when implementing the NOVA-Fortis metadata
and data protections, which use crc32c a lot. They have been removed
from this patchset, but I will double-check whether the issue still
exists.

Thanks,
Andiry

> Eric


Re: [RFC v2 05/83] Add NOVA filesystem definitions and useful helper routines.

2018-03-11 Thread Eric Biggers
On Sun, Mar 11, 2018 at 02:00:13PM +0200, Nikolay Borisov wrote:
> [Adding Herbert Xu to CC since he is the maintainer of the crypto subsys
> maintainer]
> 
> On 10.03.2018 20:17, Andiry Xu wrote:
> 
> 
> > +static inline u32 nova_crc32c(u32 crc, const u8 *data, size_t len)
> > +{
> > +   u8 *ptr = (u8 *) data;
> > +   u64 acc = crc; /* accumulator, crc32c value in lower 32b */
> > +   u32 csum;
> > +
> > +   /* x86 instruction crc32 is part of SSE-4.2 */
> > +   if (static_cpu_has(X86_FEATURE_XMM4_2)) {
> > +   /* This inline assembly implementation should be equivalent
> > +* to the kernel's crc32c_intel_le_hw() function used by
> > +* crc32c(), but this performs better on test machines.
> > +*/
> > +   while (len > 8) {
> > +   asm volatile(/* 64b quad words */
> > +   "crc32q (%1), %0"
> > +   : "=r" (acc)
> > +   : "r"  (ptr), "0" (acc)
> > +   );
> > +   ptr += 8;
> > +   len -= 8;
> > +   }
> > +
> > +   while (len > 0) {
> > +   asm volatile(/* trailing bytes */
> > +   "crc32b (%1), %0"
> > +   : "=r" (acc)
> > +   : "r"  (ptr), "0" (acc)
> > +   );
> > +   ptr++;
> > +   len--;
> > +   }
> > +
> > +   csum = (u32) acc;
> > +   } else {
> > +   /* The kernel's crc32c() function should also detect and use the
> > +* crc32 instruction of SSE-4.2. But calling in to this function
> > +* is about 3x to 5x slower than the inline assembly version on
> > +* some test machines.
> 
> That is really odd. Did you try to characterize why this is the case? Is
> it purely the overhead of dispatching to the correct backend function?
> That's a rather big performance hit.
> 
> > +*/
> > +   csum = crc32c(crc, data, len);
> > +   }
> > +
> > +   return csum;
> > +}
> > +

Are you sure that CONFIG_CRYPTO_CRC32C_INTEL was enabled during your tests and
that the accelerated version was being called?  Or, perhaps CRC32C_PCL_BREAKEVEN
(defined in arch/x86/crypto/crc32c-intel_glue.c) needs to be adjusted.  Please
don't hack around performance problems like this; if they exist, they need to be
fixed for everyone.

Eric


Re: [RFC v2 05/83] Add NOVA filesystem definitions and useful helper routines.

2018-03-11 Thread Eric Biggers
On Sun, Mar 11, 2018 at 02:00:13PM +0200, Nikolay Borisov wrote:
> [Adding Herbert Xu to CC since he is the maintainer of the crypto subsys
> maintainer]
> 
> On 10.03.2018 20:17, Andiry Xu wrote:
> 
> 
> > +static inline u32 nova_crc32c(u32 crc, const u8 *data, size_t len)
> > +{
> > +   u8 *ptr = (u8 *) data;
> > +   u64 acc = crc; /* accumulator, crc32c value in lower 32b */
> > +   u32 csum;
> > +
> > +   /* x86 instruction crc32 is part of SSE-4.2 */
> > +   if (static_cpu_has(X86_FEATURE_XMM4_2)) {
> > +   /* This inline assembly implementation should be equivalent
> > +* to the kernel's crc32c_intel_le_hw() function used by
> > +* crc32c(), but this performs better on test machines.
> > +*/
> > +   while (len > 8) {
> > +   asm volatile(/* 64b quad words */
> > +   "crc32q (%1), %0"
> > +   : "=r" (acc)
> > +   : "r"  (ptr), "0" (acc)
> > +   );
> > +   ptr += 8;
> > +   len -= 8;
> > +   }
> > +
> > +   while (len > 0) {
> > +   asm volatile(/* trailing bytes */
> > +   "crc32b (%1), %0"
> > +   : "=r" (acc)
> > +   : "r"  (ptr), "0" (acc)
> > +   );
> > +   ptr++;
> > +   len--;
> > +   }
> > +
> > +   csum = (u32) acc;
> > +   } else {
> > +   /* The kernel's crc32c() function should also detect and use the
> > +* crc32 instruction of SSE-4.2. But calling in to this function
> > +* is about 3x to 5x slower than the inline assembly version on
> > +* some test machines.
> 
> That is really odd. Did you try to characterize why this is the case? Is
> it purely the overhead of dispatching to the correct backend function?
> That's a rather big performance hit.
> 
> > +*/
> > +   csum = crc32c(crc, data, len);
> > +   }
> > +
> > +   return csum;
> > +}
> > +

Are you sure that CONFIG_CRYPTO_CRC32C_INTEL was enabled during your tests and
that the accelerated version was being called?  Or, perhaps CRC32C_PCL_BREAKEVEN
(defined in arch/x86/crypto/crc32c-intel_glue.c) needs to be adjusted.  Please
don't hack around performance problems like this; if they exist, they need to be
fixed for everyone.

Eric


Re: [RFC v2 05/83] Add NOVA filesystem definitions and useful helper routines.

2018-03-11 Thread Nikolay Borisov
[Adding Herbert Xu to CC since he is the maintainer of the crypto
subsystem]

On 10.03.2018 20:17, Andiry Xu wrote:


> +static inline u32 nova_crc32c(u32 crc, const u8 *data, size_t len)
> +{
> + u8 *ptr = (u8 *) data;
> + u64 acc = crc; /* accumulator, crc32c value in lower 32b */
> + u32 csum;
> +
> + /* x86 instruction crc32 is part of SSE-4.2 */
> + if (static_cpu_has(X86_FEATURE_XMM4_2)) {
> + /* This inline assembly implementation should be equivalent
> +  * to the kernel's crc32c_intel_le_hw() function used by
> +  * crc32c(), but this performs better on test machines.
> +  */
> + while (len > 8) {
> + asm volatile(/* 64b quad words */
> + "crc32q (%1), %0"
> + : "=r" (acc)
> + : "r"  (ptr), "0" (acc)
> + );
> + ptr += 8;
> + len -= 8;
> + }
> +
> + while (len > 0) {
> + asm volatile(/* trailing bytes */
> + "crc32b (%1), %0"
> + : "=r" (acc)
> + : "r"  (ptr), "0" (acc)
> + );
> + ptr++;
> + len--;
> + }
> +
> + csum = (u32) acc;
> + } else {
> + /* The kernel's crc32c() function should also detect and use the
> +  * crc32 instruction of SSE-4.2. But calling in to this function
> +  * is about 3x to 5x slower than the inline assembly version on
> +  * some test machines.

That is really odd. Did you try to characterize why this is the case? Is
it purely the overhead of dispatching to the correct backend function?
That's a rather big performance hit.

> +  */
> + csum = crc32c(crc, data, len);
> + }
> +
> + return csum;
> +}
> +




Re: [RFC v2 05/83] Add NOVA filesystem definitions and useful helper routines.

2018-03-11 Thread Nikolay Borisov
[Adding Herbert Xu to CC since he is the maintainer of the crypto
subsystem]

On 10.03.2018 20:17, Andiry Xu wrote:


> +static inline u32 nova_crc32c(u32 crc, const u8 *data, size_t len)
> +{
> + u8 *ptr = (u8 *) data;
> + u64 acc = crc; /* accumulator, crc32c value in lower 32b */
> + u32 csum;
> +
> + /* x86 instruction crc32 is part of SSE-4.2 */
> + if (static_cpu_has(X86_FEATURE_XMM4_2)) {
> + /* This inline assembly implementation should be equivalent
> +  * to the kernel's crc32c_intel_le_hw() function used by
> +  * crc32c(), but this performs better on test machines.
> +  */
> + while (len > 8) {
> + asm volatile(/* 64b quad words */
> + "crc32q (%1), %0"
> + : "=r" (acc)
> + : "r"  (ptr), "0" (acc)
> + );
> + ptr += 8;
> + len -= 8;
> + }
> +
> + while (len > 0) {
> + asm volatile(/* trailing bytes */
> + "crc32b (%1), %0"
> + : "=r" (acc)
> + : "r"  (ptr), "0" (acc)
> + );
> + ptr++;
> + len--;
> + }
> +
> + csum = (u32) acc;
> + } else {
> + /* The kernel's crc32c() function should also detect and use the
> +  * crc32 instruction of SSE-4.2. But calling in to this function
> +  * is about 3x to 5x slower than the inline assembly version on
> +  * some test machines.

That is really odd. Did you try to characterize why this is the case? Is
it purely the overhead of dispatching to the correct backend function?
That's a rather big performance hit.

> +  */
> + csum = crc32c(crc, data, len);
> + }
> +
> + return csum;
> +}
> +




[RFC v2 05/83] Add NOVA filesystem definitions and useful helper routines.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

NOVA stores offsets rather than absolute addresses in pmem.
nova_get_block() and nova_get_addr_off() translate between
these two kinds of addresses.

Signed-off-by: Andiry Xu 
---
 fs/nova/nova.h | 299 +
 1 file changed, 299 insertions(+)
 create mode 100644 fs/nova/nova.h

diff --git a/fs/nova/nova.h b/fs/nova/nova.h
new file mode 100644
index 000..5eb696c
--- /dev/null
+++ b/fs/nova/nova.h
@@ -0,0 +1,299 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Definitions for the NOVA filesystem.
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu 
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli 
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+#ifndef __NOVA_H
+#define __NOVA_H
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "nova_def.h"
+
+#define PAGE_SHIFT_2M 21
+#define PAGE_SHIFT_1G 30
+
+
+/*
+ * Debug code
+ */
+#ifdef pr_fmt
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#endif
+
+/* #define nova_dbg(s, args...)pr_debug(s, ## args) */
+#define nova_dbg(s, args ...)  pr_info(s, ## args)
+#define nova_err(sb, s, args ...)  nova_error_mng(sb, s, ## args)
+#define nova_warn(s, args ...) pr_warn(s, ## args)
+#define nova_info(s, args ...) pr_info(s, ## args)
+
+extern unsigned int nova_dbgmask;
+#define NOVA_DBGMASK_MMAPHUGE (0x0001)
+#define NOVA_DBGMASK_MMAP4K   (0x0002)
+#define NOVA_DBGMASK_MMAPVERBOSE   (0x0004)
+#define NOVA_DBGMASK_MMAPVVERBOSE  (0x0008)
+#define NOVA_DBGMASK_VERBOSE  (0x0010)
+#define NOVA_DBGMASK_TRANSACTION   (0x0020)
+
+#define nova_dbg_mmap4k(s, args ...)\
+   ((nova_dbgmask & NOVA_DBGMASK_MMAP4K) ? nova_dbg(s, args) : 0)
+#define nova_dbg_mmapv(s, args ...) \
+   ((nova_dbgmask & NOVA_DBGMASK_MMAPVERBOSE) ? nova_dbg(s, args) : 0)
+#define nova_dbg_mmapvv(s, args ...)\
+   ((nova_dbgmask & NOVA_DBGMASK_MMAPVVERBOSE) ? nova_dbg(s, args) : 0)
+
+#define nova_dbg_verbose(s, args ...)   \
+   ((nova_dbgmask & NOVA_DBGMASK_VERBOSE) ? nova_dbg(s, ##args) : 0)
+#define nova_dbgv(s, args ...) nova_dbg_verbose(s, ##args)
+#define nova_dbg_trans(s, args ...) \
+   ((nova_dbgmask & NOVA_DBGMASK_TRANSACTION) ? nova_dbg(s, ##args) : 0)
+
+#define NOVA_ASSERT(x) do {\
+  if (!(x))\
+  nova_warn("assertion failed %s:%d: %s\n", \
+  __FILE__, __LINE__, #x);\
+  } while (0)
+
+#define nova_set_bit  __test_and_set_bit_le
+#define nova_clear_bit    __test_and_clear_bit_le
+#define nova_find_next_zero_bit   find_next_zero_bit_le
+
+#define clear_opt(o, opt)  (o &= ~NOVA_MOUNT_ ## opt)
+#define set_opt(o, opt)    (o |= NOVA_MOUNT_ ## opt)
+#define test_opt(sb, opt)  (NOVA_SB(sb)->s_mount_opt & NOVA_MOUNT_ ## opt)
+
+#define NOVA_LARGE_INODE_TABLE_SIZE (0x20)
+/* NOVA size threshold for using 2M blocks for inode table */
+#define NOVA_LARGE_INODE_TABLE_THREASHOLD (0x2000)
+/*
+ * nova inode flags
+ *
+ * NOVA_EOFBLOCKS_FL   There are blocks allocated beyond eof
+ */
+#define NOVA_EOFBLOCKS_FL  0x2000
+/* Flags that should be inherited by new inodes from their parent. */
+#define NOVA_FL_INHERITED (FS_SECRM_FL | FS_UNRM_FL | FS_COMPR_FL | \
+   FS_SYNC_FL | FS_NODUMP_FL | FS_NOATIME_FL | \
+   FS_COMPRBLK_FL | FS_NOCOMP_FL | \
+   FS_JOURNAL_DATA_FL | FS_NOTAIL_FL | FS_DIRSYNC_FL)
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define NOVA_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL))
+/* Flags that are appropriate for non-directories/regular files. */
+#define NOVA_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL)
+#define NOVA_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | NOVA_EOFBLOCKS_FL)
+
+/* IOCTLs */
+#define NOVA_PRINT_TIMING   0xBCD00010
+#define NOVA_CLEAR_STATS    0xBCD00011
+#define NOVA_PRINT_LOG      0xBCD00013
+#define

[RFC v2 05/83] Add NOVA filesystem definitions and useful helper routines.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

NOVA stores offset rather than absolute addresses in pmem.
nova_get_block() and nova_get_addr_off() provide transitions
between these two kinds of addresses.

Signed-off-by: Andiry Xu 
---
 fs/nova/nova.h | 299 +
 1 file changed, 299 insertions(+)
 create mode 100644 fs/nova/nova.h

diff --git a/fs/nova/nova.h b/fs/nova/nova.h
new file mode 100644
index 000..5eb696c
--- /dev/null
+++ b/fs/nova/nova.h
@@ -0,0 +1,299 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Definitions for the NOVA filesystem.
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu 
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli 
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+#ifndef __NOVA_H
+#define __NOVA_H
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "nova_def.h"
+
+#define PAGE_SHIFT_2M 21
+#define PAGE_SHIFT_1G 30
+
+
+/*
+ * Debug code
+ */
+#ifdef pr_fmt
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#endif
+
+/* #define nova_dbg(s, args...)pr_debug(s, ## args) */
+#define nova_dbg(s, args ...)  pr_info(s, ## args)
+#define nova_err(sb, s, args ...)  nova_error_mng(sb, s, ## args)
+#define nova_warn(s, args ...) pr_warn(s, ## args)
+#define nova_info(s, args ...) pr_info(s, ## args)
+
+extern unsigned int nova_dbgmask;
+#define NOVA_DBGMASK_MMAPHUGE (0x0001)
+#define NOVA_DBGMASK_MMAP4K   (0x0002)
+#define NOVA_DBGMASK_MMAPVERBOSE   (0x0004)
+#define NOVA_DBGMASK_MMAPVVERBOSE  (0x0008)
+#define NOVA_DBGMASK_VERBOSE  (0x0010)
+#define NOVA_DBGMASK_TRANSACTION   (0x0020)
+
+#define nova_dbg_mmap4k(s, args ...)\
+   ((nova_dbgmask & NOVA_DBGMASK_MMAP4K) ? nova_dbg(s, args) : 0)
+#define nova_dbg_mmapv(s, args ...) \
+   ((nova_dbgmask & NOVA_DBGMASK_MMAPVERBOSE) ? nova_dbg(s, args) : 0)
+#define nova_dbg_mmapvv(s, args ...)\
+   ((nova_dbgmask & NOVA_DBGMASK_MMAPVVERBOSE) ? nova_dbg(s, args) : 0)
+
+#define nova_dbg_verbose(s, args ...)   \
+   ((nova_dbgmask & NOVA_DBGMASK_VERBOSE) ? nova_dbg(s, ##args) : 0)
+#define nova_dbgv(s, args ...) nova_dbg_verbose(s, ##args)
+#define nova_dbg_trans(s, args ...) \
+   ((nova_dbgmask & NOVA_DBGMASK_TRANSACTION) ? nova_dbg(s, ##args) : 0)
+
+#define NOVA_ASSERT(x) do {\
+  if (!(x))\
+  nova_warn("assertion failed %s:%d: %s\n", \
+  __FILE__, __LINE__, #x);\
+  } while (0)
+
+#define nova_set_bit  __test_and_set_bit_le
+#define nova_clear_bit    __test_and_clear_bit_le
+#define nova_find_next_zero_bit   find_next_zero_bit_le
+
+#define clear_opt(o, opt)  (o &= ~NOVA_MOUNT_ ## opt)
+#define set_opt(o, opt)    (o |= NOVA_MOUNT_ ## opt)
+#define test_opt(sb, opt)  (NOVA_SB(sb)->s_mount_opt & NOVA_MOUNT_ ## opt)
+
+#define NOVA_LARGE_INODE_TABLE_SIZE (0x20)
+/* NOVA size threshold for using 2M blocks for inode table */
+#define NOVA_LARGE_INODE_TABLE_THREASHOLD (0x2000)
+/*
+ * nova inode flags
+ *
+ * NOVA_EOFBLOCKS_FL   There are blocks allocated beyond eof
+ */
+#define NOVA_EOFBLOCKS_FL  0x2000
+/* Flags that should be inherited by new inodes from their parent. */
+#define NOVA_FL_INHERITED (FS_SECRM_FL | FS_UNRM_FL | FS_COMPR_FL | \
+   FS_SYNC_FL | FS_NODUMP_FL | FS_NOATIME_FL | \
+   FS_COMPRBLK_FL | FS_NOCOMP_FL | \
+   FS_JOURNAL_DATA_FL | FS_NOTAIL_FL | FS_DIRSYNC_FL)
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define NOVA_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL))
+/* Flags that are appropriate for non-directories/regular files. */
+#define NOVA_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL)
+#define NOVA_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | NOVA_EOFBLOCKS_FL)
+
+/* IOCTLs */
+#define NOVA_PRINT_TIMING          0xBCD00010
+#define NOVA_CLEAR_STATS           0xBCD00011
+#define NOVA_PRINT_LOG             0xBCD00013
+#define NOVA_PRINT_LOG_BLOCKNODE   0xBCD00014
+#defineNOVA_PRINT_LOG_PAGES