[V2][PATCH] vt: keyboard, fix uninitialized variables warning

2021-03-03 Thread Li Wang
drivers/tty/vt/keyboard.c: In function 'vt_do_kdgkb_ioctl':
drivers/tty/vt/keyboard.c: warning: 'ret' may be used uninitialized in this function [-Wmaybe-uninitialized]
  return ret;
 ^~~
drivers/tty/vt/keyboard.c: warning: 'kbs' may be used uninitialized in this function [-Wmaybe-uninitialized]
  kfree(kbs);
  ^~~~~~

Signed-off-by: Li Wang 
---
 drivers/tty/vt/keyboard.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c
index 7763862..62f1ecb 100644
--- a/drivers/tty/vt/keyboard.c
+++ b/drivers/tty/vt/keyboard.c
@@ -2090,6 +2090,8 @@ int vt_do_kdgkb_ioctl(int cmd, struct kbsentry __user *user_kdgkb, int perm)
 
ret = 0;
break;
+   default:
+   return -EINVAL;
}
 
kfree(kbs);
-- 
2.7.4
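
For context, a minimal user-space sketch (hypothetical code, not the kernel
function) that provokes the same -Wmaybe-uninitialized diagnostic from gcc at
-O2: a switch that assigns 'kbs' and 'ret' in every case it handles, but has
no default arm, so the compiler cannot prove the variables are set on every
path.

/* sketch.c - build with: gcc -O2 -Wall -c sketch.c */
#include <stdlib.h>

int do_cmd(int cmd)
{
	char *kbs;
	int ret;

	switch (cmd) {
	case 0:
		kbs = malloc(16);
		if (!kbs)
			return -1;
		ret = 0;
		break;
	case 1:
		kbs = malloc(32);
		if (!kbs)
			return -1;
		ret = 0;
		break;
	/* no default: gcc cannot rule out a fall-through path */
	}

	free(kbs);	/* 'kbs' may be used uninitialized */
	return ret;	/* 'ret' may be used uninitialized */
}

The V2 patch above closes that path with "default: return -EINVAL;", while
the earlier version below initializes the variables instead.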



[PATCH] vt: keyboard, fix uninitialized variables warning

2021-03-03 Thread Li Wang
drivers/tty/vt/keyboard.c: In function 'vt_do_kdgkb_ioctl':
drivers/tty/vt/keyboard.c: warning: 'ret' may be used uninitialized in this function [-Wmaybe-uninitialized]
  return ret;
 ^~~
kernel-source/drivers/tty/vt/keyboard.c: warning: 'kbs' may be used uninitialized in this function [-Wmaybe-uninitialized]
  kfree(kbs);
  ^~~~~~

Signed-off-by: Li Wang 
---
 drivers/tty/vt/keyboard.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c
index 7763862..3e73d55 100644
--- a/drivers/tty/vt/keyboard.c
+++ b/drivers/tty/vt/keyboard.c
@@ -2049,8 +2049,8 @@ int vt_do_kdgkb_ioctl(int cmd, struct kbsentry __user *user_kdgkb, int perm)
 {
unsigned char kb_func;
unsigned long flags;
-   char *kbs;
-   int ret;
+   char *kbs = NULL;
+   int ret = -EINVAL;
 
if (get_user(kb_func, &user_kdgkb->kb_func))
return -EFAULT;
-- 
2.7.4



[PATCH] vhost: reduce stack usage in log_used

2020-09-14 Thread Li Wang
Fix the warning: [-Werror=-Wframe-larger-than=]

drivers/vhost/vhost.c: In function log_used:
drivers/vhost/vhost.c:1906:1:
warning: the frame size of 1040 bytes is larger than 1024 bytes

Signed-off-by: Li Wang 
---
 drivers/vhost/vhost.c | 2 +-
 drivers/vhost/vhost.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index b45519c..31837a5
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1884,7 +1884,7 @@ static int log_write_hva(struct vhost_virtqueue *vq, u64 hva, u64 len)
 
 static int log_used(struct vhost_virtqueue *vq, u64 used_offset, u64 len)
 {
-   struct iovec iov[64];
+   struct iovec *iov = vq->log_iov;
int i, ret;
 
if (!vq->iotlb)
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 9032d3c..5fe4b47
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -123,6 +123,7 @@ struct vhost_virtqueue {
/* Log write descriptors */
void __user *log_base;
struct vhost_log *log;
+   struct iovec log_iov[64];
 
/* Ring endianness. Defaults to legacy native endianness.
 * Set to true when starting a modern virtio device. */
-- 
2.7.4
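
The arithmetic behind the warning, for reference: on LP64 targets struct
iovec is 16 bytes (a pointer plus a size_t), so iov[64] alone accounts for
1024 of the 1040 bytes in the reported frame. A user-space sanity check of
that assumption (C11):

#include <sys/uio.h>
#include <assert.h>

int main(void)
{
	/* LP64 assumed: iov_base (8 bytes) + iov_len (8 bytes) */
	static_assert(sizeof(struct iovec) == 16, "LP64 layout assumed");
	/* the array this patch moves off the stack */
	static_assert(sizeof(struct iovec[64]) == 1024, "64 iovecs = 1 KiB");
	return 0;
}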



[PATCH] vhost: reduce stack usage in log_used

2020-09-11 Thread Li Wang
Fix the warning: [-Werror=-Wframe-larger-than=]

drivers/vhost/vhost.c: In function log_used:
drivers/vhost/vhost.c:1906:1:
warning: the frame size of 1040 bytes is larger than 1024 bytes

Signed-off-by: Li Wang 
---
 drivers/vhost/vhost.c | 14 ++
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index b45519c..41769de 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1884,25 +1884,31 @@ static int log_write_hva(struct vhost_virtqueue *vq, u64 hva, u64 len)
 
 static int log_used(struct vhost_virtqueue *vq, u64 used_offset, u64 len)
 {
-   struct iovec iov[64];
+   struct iovec *iov;
int i, ret;
 
if (!vq->iotlb)
return log_write(vq->log_base, vq->log_addr + used_offset, len);
 
+   iov = kcalloc(64, sizeof(*iov), GFP_KERNEL);
+   if (!iov)
+   return -ENOMEM;
+
ret = translate_desc(vq, (uintptr_t)vq->used + used_offset,
 len, iov, 64, VHOST_ACCESS_WO);
if (ret < 0)
-   return ret;
+   goto out;
 
for (i = 0; i < ret; i++) {
ret = log_write_hva(vq, (uintptr_t)iov[i].iov_base,
iov[i].iov_len);
if (ret)
-   return ret;
+   goto out;
}
 
-   return 0;
+out:
+   kfree(iov);
+   return ret;
 }
 
 int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
-- 
2.7.4



[5.3.0-rc4 Bug] WARNING: CPU: 17 PID: 25085 at lib/list_debug.c:47 __list_del_entry_valid+0x4e/0x90

2019-08-21 Thread Li Wang
[  119.912293] Call Trace:
[  119.924344]  lpfc_sli4_queue_destroy+0x11a/0x390 [lpfc]
[  119.949270]  lpfc_pci_remove_one+0x7d6/0x970 [lpfc]
[  119.976858]  pci_device_shutdown+0x34/0x60
[  119.996353]  device_shutdown+0x160/0x1c0
[  120.015045]  kernel_restart+0xe/0x30
[  120.033515]  __do_sys_reboot+0x1cf/0x210
[  120.054274]  ? __fput+0x168/0x250
[  120.070250]  ? syscall_trace_enter+0x198/0x2c0
[  120.091719]  ? __audit_syscall_exit+0x249/0x2a0
[  120.115046]  do_syscall_64+0x59/0x1e0
[  120.135104]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[  120.158848] RIP: 0033:0x7f7f6f48c427
[  120.175870] Code: 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00
00 00 90 f3 0f 1e fa 89 fa be 69 19 12 28 bf ad de e1 fe b8 a9 00 00
00 0f 05 <48> 3d 00 f0 ff ff 77 01 c3 48 8b 15 31 9a 2c 00 f7 d8 64 89
02 b8
[  120.279588] RSP: 002b:7fffe3e36288 EFLAGS: 0246 ORIG_RAX:
00a9
[  120.321551] RAX: ffda RBX:  RCX: 7f7f6f48c427
[  120.361854] RDX: 01234567 RSI: 28121969 RDI: fee1dead
[  120.398536] RBP: 7fffe3e362d0 R08: 0002 R09: 
[  120.436702] R10: 004b R11: 0246 R12: 0001
[  120.470997] R13: fffe R14: 0006 R15: 
[  120.508557] Modules linked in: sunrpc amd64_edac_mod edac_mce_amd
kvm_amd ccp kvm irqbypass ipmi_ssif crct10dif_pclmul crc32_pclmul
sp5100_tco ipmi_si joydev ghash_clmulni_intel pcspkr i2c_piix4 hpwdt
sg fam15h_power ipmi_devintf k10temp hpilo ipmi_msghandler
acpi_power_meter xfs libcrc32c radeon i2c_algo_bit drm_kms_helper
syscopyarea sysfillrect sysimgblt fb_sys_fops lpfc sd_mod ttm ahci
nvmet_fc nvmet libahci ata_generic drm nvme_fc libata nvme_fabrics
netxen_nic hpsa nvme_core crc32c_intel scsi_transport_fc serio_raw
scsi_transport_sas dm_mirror dm_region_hash dm_log dm_mod


--
Regards,
Li Wang


Re: ltp/read_all_sys (read_all -d /sys -q -r 10) cause system panic with kernel-4.18.0-rc1

2018-07-04 Thread Li Wang

On Tue, Jun 19, 2018 at 6:41 PM, Li Wang  wrote:
> Hi,
>
> I'm hitting this panic when running ltp/read_all_sys on kernel-v4.18-rc1.
>
> Test env:
> FUJITSU PRIMERGY RX200 S6 GS01
> Intel(R) Xeon(R) CPU E5620 @ 2.40GHz
> 16384 MB memory, 598 GB disk space
>
>
> [ 5915.705844] BUG: unable to handle kernel NULL pointer dereference
> at 00b8
> [ 5915.714587] PGD 80042bcf7067 P4D 80042bcf7067 PUD 423f4e067 PMD 0
> [ 5915.722254] Oops:  [#1] SMP PTI
> [ 5915.726147] CPU: 6 PID: 18535 Comm: read_all Tainted: P
> IOE 4.18.0-rc1 #1
> [ 5915.734980] Hardware name: FUJITSU
> PRIMERGY RX200 S6 /D3031, BIOS 6.00 Rev. 1.10.3031
>  01/20/2012
> [ 5915.749654] RIP: 0010:qla_dfs_tgt_counters_show+0x92/0x2a0 [qla2xxx]
> [ 5915.756733] Code: b6 86 22 01 00 00 66 85 c0 74 63 83 e8 01 4c 8b
> 9e b8 00 00 00 31 f6 0f b7 c0 48 8d 3c c5 08 00 00 00 49 8b 04 33 48
> 83 c6 08 <48> 03 90 b8 00 00 00 48 03 88 c0 00 00 00 4c 03 80 c8 00 00
> 00 4c
> [ 5915.777816] RSP: 0018:af04109e3d60 EFLAGS: 00010202
> [ 5915.783645] RAX:  RBX:  RCX: 
> 
> [ 5915.791606] RDX:  RSI: 0008 RDI: 
> 0040
> [ 5915.799568] RBP:  R08:  R09: 
> 
> [ 5915.807529] R10: 956823a74798 R11: 956824a29000 R12: 
> 
> [ 5915.815489] R13:  R14: 9567badfc280 R15: 
> 
> [ 5915.823451] FS:  7f27336a1740() GS:95683fd8()
> knlGS:
> [ 5915.832479] CS:  0010 DS:  ES:  CR0: 80050033
> [ 5915.838890] CR2: 00b8 CR3: 00042960a003 CR4: 
> 000206e0
> [ 5915.846850] Call Trace:
> [ 5915.849583]  ? __kmalloc_node+0x195/0x280
> [ 5915.854056]  ? seq_read+0x33e/0x3f0
> [ 5915.857946]  seq_read+0x120/0x3f0
> [ 5915.861643]  full_proxy_read+0x50/0x70
> [ 5915.865827]  __vfs_read+0x36/0x190
> [ 5915.869622]  vfs_read+0x87/0x130
> [ 5915.873223]  ksys_read+0x52/0xc0
> [ 5915.876823]  do_syscall_64+0x5b/0x180
> [ 5915.880910]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> [ 5915.886547] RIP: 0033:0x7f2733280790
> [ 5915.890532] Code: 73 01 c3 48 8b 0d 18 88 20 00 f7 d8 64 89 01 48
> 83 c8 ff c3 66 0f 1f 44 00 00 83 3d 59 cc 20 00 00 75 10 b8 00 00 00
> 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 1e fc ff ff 48 89
> 04 24
> [ 5915.911617] RSP: 002b:7ffef181c738 EFLAGS: 0246 ORIG_RAX:
> 
> [ 5915.920064] RAX: ffda RBX: 0006 RCX: 
> 7f2733280790
> [ 5915.928025] RDX: 03ff RSI: 7ffef181cbf0 RDI: 
> 0006
> [ 5915.935986] RBP: 0b7b R08:  R09: 
> 7ffef181c690
> [ 5915.943949] R10:  R11: 0246 R12: 
> 7f2733688000
> [ 5915.951909] R13: 7ffef181cbf0 R14: 0028 R15: 
> 0030
> [ 5915.959871] Modules linked in: dummy veth binfmt_misc sctp overlay
> tun fuse vfat fat btrfs xor zstd_decompress zstd_compress xxhash
> raid6_pq ext4 mbcache jbd2 loop sunrpc intel_powerclamp coretemp
> kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul
> ghash_clmulni_intel pcbc ipmi_ssif aesni_intel crypto_simd iTCO_wdt
> ipmi_si cryptd iTCO_vendor_support glue_helper gpio_ich ipmi_devintf
> sg acpi_power_meter ipmi_msghandler i2c_i801 pcspkr lpc_ich
> i7core_edac acpi_cpufreq ip_tables xfs libcrc32c sd_mod sr_mod cdrom
> mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops
> ttm ata_generic pata_acpi qla2xxx drm igb ata_piix nvme_fc mptsas
> libata nvme_fabrics scsi_transport_sas dca crc32c_intel mptscsih
> i2c_algo_bit nvme_core i2c_core mptbase scsi_transport_fc dm_mirror
> dm_region_hash dm_log
> [ 5916.038566]  dm_mod [last unloaded: ltp_insmod01]
> [ 5916.043814] CR2: 00b8
> [ 5916.047513] BUG: unable to handle kernel NULL pointer dereference
> at 00b8
> [ 5916.047537] ---[ end trace 1dddacfb06305174 ]---
>
>
> --
> Regards,
> Li Wang



-- 
Regards,
Li Wang


Re: [PATCH v2] zswap: re-check zswap_is_full after do zswap_shrink

2018-06-25 Thread Li Wang
On 30 May 2018 at 20:53, Dan Streetman  wrote:
> On Wed, May 30, 2018 at 6:39 AM, Li Wang  wrote:
>> The '/sys/../zswap/stored_pages' counter keeps rising in a zswap test
>> with the "zswap.max_pool_percent=0" parameter. But theoretically, it
>> should not compress or store any more pages, since there is no space
>> left in the compressed pool.
>>
>> Reproduce steps:
>>   1. Boot kernel with "zswap.enabled=1"
>>   2. Set the max_pool_percent to 0
>>   # echo 0 > /sys/module/zswap/parameters/max_pool_percent
>>   3. Do memory stress test to see if some pages have been compressed
>>   # stress --vm 1 --vm-bytes $mem_available"M" --timeout 60s
>>   4. Watching the 'stored_pages' number increasing or not
>>
>> The root cause is:
>>   When zswap_max_pool_percent is set to 0 via a kernel parameter,
>>   zswap_is_full() will always return true and trigger zswap_shrink().
>>   But if the shrinking is able to reclaim a page successfully, zswap
>>   proceeds to compress/store another page, so the value of
>>   stored_pages will keep changing.
>>
>> To solve the issue, this patch checks zswap_is_full() again after
>> zswap_shrink() to make sure the pool is now under max_pool_percent,
>> and refuses to compress/store if it is still at its limit.
>>
>> Signed-off-by: Li Wang 
>
> Acked-by: Dan Streetman 

ping~

Is it possible to merge this into kernel-4.18-rcX? My zswap test always
fails on the upstream kernel.


-- 
Regards,
Li Wang
Email: wangli.a...@gmail.com


ltp/read_all_sys (read_all -d /sys -q -r 10) cause system panic with kernel-4.18.0-rc1

2018-06-19 Thread Li Wang
Hi,

I'm hitting this panic when running ltp/read_all_sys on kernel-v4.18-rc1.

Test env:
FUJITSU PRIMERGY RX200 S6 GS01
Intel(R) Xeon(R) CPU E5620 @ 2.40GHz
16384 MB memory, 598 GB disk space


[ 5915.705844] BUG: unable to handle kernel NULL pointer dereference
at 00b8
[ 5915.714587] PGD 80042bcf7067 P4D 80042bcf7067 PUD 423f4e067 PMD 0
[ 5915.722254] Oops:  [#1] SMP PTI
[ 5915.726147] CPU: 6 PID: 18535 Comm: read_all Tainted: P
IOE 4.18.0-rc1 #1
[ 5915.734980] Hardware name: FUJITSU
PRIMERGY RX200 S6 /D3031, BIOS 6.00 Rev. 1.10.3031
 01/20/2012
[ 5915.749654] RIP: 0010:qla_dfs_tgt_counters_show+0x92/0x2a0 [qla2xxx]
[ 5915.756733] Code: b6 86 22 01 00 00 66 85 c0 74 63 83 e8 01 4c 8b
9e b8 00 00 00 31 f6 0f b7 c0 48 8d 3c c5 08 00 00 00 49 8b 04 33 48
83 c6 08 <48> 03 90 b8 00 00 00 48 03 88 c0 00 00 00 4c 03 80 c8 00 00
00 4c
[ 5915.777816] RSP: 0018:af04109e3d60 EFLAGS: 00010202
[ 5915.783645] RAX:  RBX:  RCX: 
[ 5915.791606] RDX:  RSI: 0008 RDI: 0040
[ 5915.799568] RBP:  R08:  R09: 
[ 5915.807529] R10: 956823a74798 R11: 956824a29000 R12: 
[ 5915.815489] R13:  R14: 9567badfc280 R15: 
[ 5915.823451] FS:  7f27336a1740() GS:95683fd8()
knlGS:
[ 5915.832479] CS:  0010 DS:  ES:  CR0: 80050033
[ 5915.838890] CR2: 00b8 CR3: 00042960a003 CR4: 000206e0
[ 5915.846850] Call Trace:
[ 5915.849583]  ? __kmalloc_node+0x195/0x280
[ 5915.854056]  ? seq_read+0x33e/0x3f0
[ 5915.857946]  seq_read+0x120/0x3f0
[ 5915.861643]  full_proxy_read+0x50/0x70
[ 5915.865827]  __vfs_read+0x36/0x190
[ 5915.869622]  vfs_read+0x87/0x130
[ 5915.873223]  ksys_read+0x52/0xc0
[ 5915.876823]  do_syscall_64+0x5b/0x180
[ 5915.880910]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 5915.886547] RIP: 0033:0x7f2733280790
[ 5915.890532] Code: 73 01 c3 48 8b 0d 18 88 20 00 f7 d8 64 89 01 48
83 c8 ff c3 66 0f 1f 44 00 00 83 3d 59 cc 20 00 00 75 10 b8 00 00 00
00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 1e fc ff ff 48 89
04 24
[ 5915.911617] RSP: 002b:7ffef181c738 EFLAGS: 0246 ORIG_RAX:

[ 5915.920064] RAX: ffda RBX: 0006 RCX: 7f2733280790
[ 5915.928025] RDX: 03ff RSI: 7ffef181cbf0 RDI: 0006
[ 5915.935986] RBP: 0b7b R08:  R09: 7ffef181c690
[ 5915.943949] R10:  R11: 0246 R12: 7f2733688000
[ 5915.951909] R13: 7ffef181cbf0 R14: 0028 R15: 0030
[ 5915.959871] Modules linked in: dummy veth binfmt_misc sctp overlay
tun fuse vfat fat btrfs xor zstd_decompress zstd_compress xxhash
raid6_pq ext4 mbcache jbd2 loop sunrpc intel_powerclamp coretemp
kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul
ghash_clmulni_intel pcbc ipmi_ssif aesni_intel crypto_simd iTCO_wdt
ipmi_si cryptd iTCO_vendor_support glue_helper gpio_ich ipmi_devintf
sg acpi_power_meter ipmi_msghandler i2c_i801 pcspkr lpc_ich
i7core_edac acpi_cpufreq ip_tables xfs libcrc32c sd_mod sr_mod cdrom
mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops
ttm ata_generic pata_acpi qla2xxx drm igb ata_piix nvme_fc mptsas
libata nvme_fabrics scsi_transport_sas dca crc32c_intel mptscsih
i2c_algo_bit nvme_core i2c_core mptbase scsi_transport_fc dm_mirror
dm_region_hash dm_log
[ 5916.038566]  dm_mod [last unloaded: ltp_insmod01]
[ 5916.043814] CR2: 00b8
[ 5916.047513] BUG: unable to handle kernel NULL pointer dereference
at 00b8
[ 5916.047537] ---[ end trace 1dddacfb06305174 ]---


-- 
Regards,
Li Wang


[PATCH v2] zswap: re-check zswap_is_full after do zswap_shrink

2018-05-30 Thread Li Wang
The '/sys/../zswap/stored_pages' counter keeps rising in a zswap test
with the "zswap.max_pool_percent=0" parameter. But theoretically, it
should not compress or store any more pages, since there is no space
left in the compressed pool.

Reproduce steps:
  1. Boot kernel with "zswap.enabled=1"
  2. Set the max_pool_percent to 0
  # echo 0 > /sys/module/zswap/parameters/max_pool_percent
  3. Do memory stress test to see if some pages have been compressed
  # stress --vm 1 --vm-bytes $mem_available"M" --timeout 60s
  4. Watching the 'stored_pages' number increasing or not

The root cause is:
  When zswap_max_pool_percent is set to 0 via a kernel parameter,
  zswap_is_full() will always return true and trigger zswap_shrink().
  But if the shrinking is able to reclaim a page successfully, zswap
  proceeds to compress/store another page, so the value of stored_pages
  will keep changing.

To solve the issue, this patch checks zswap_is_full() again after
zswap_shrink() to make sure the pool is now under max_pool_percent, and
refuses to compress/store if it is still at its limit.

Signed-off-by: Li Wang 
Cc: Seth Jennings 
Cc: Dan Streetman 
Cc: Huang Ying 
Cc: Yu Zhao 
---
 mm/zswap.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/mm/zswap.c b/mm/zswap.c
index 61a5c41..fd320c3 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1026,6 +1026,15 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
ret = -ENOMEM;
goto reject;
}
+
+   /* A second zswap_is_full() check after
+* zswap_shrink() to make sure it's now
+* under the max_pool_percent
+*/
+   if (zswap_is_full()) {
+   ret = -ENOMEM;
+   goto reject;
+   }
}
 
/* allocate entry */
-- 
2.9.5
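
For readers skimming the diff, a heavily condensed sketch of the store path
after this change (error paths, counters, and locking elided; not the literal
kernel code):

/* simplified view of zswap_frontswap_store() with the fix applied */
static int store_sketch(void)
{
	if (zswap_is_full()) {
		if (zswap_shrink())
			return -ENOMEM;	/* reclaim failed */
		/*
		 * The added check: reclaiming one page does not prove we
		 * are back under max_pool_percent -- and never can be when
		 * the limit is 0 -- so re-test before storing.
		 */
		if (zswap_is_full())
			return -ENOMEM;
	}
	/* ... allocate entry, compress and store the page ... */
	return 0;
}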



[PATCH RFC] zswap: reject to compress/store page if zswap_max_pool_percent is 0

2018-05-24 Thread Li Wang
The '/sys/../zswap/stored_pages' counter keeps rising in a zswap test
with the "zswap.max_pool_percent=0" parameter. But theoretically, it
should not compress or store any more pages, since there is no space
left for the compressed pool.

Reproduce steps:

  1. Boot kernel with "zswap.enabled=1 zswap.max_pool_percent=17"
  2. Set the max_pool_percent to 0
  # echo 0 > /sys/module/zswap/parameters/max_pool_percent
 Confirm this parameter works fine
  # cat /sys/kernel/debug/zswap/pool_total_size
  0
  3. Do memory stress test to see if some pages have been compressed
  # stress --vm 1 --vm-bytes $mem_available"M" --timeout 60s
 Watching the 'stored_pages' numbers increasing or not

The root cause is:

  When zswap_max_pool_percent is set to 0 via a kernel parameter,
  zswap_is_full() will always return true, so the pool keeps being
  shrunk by zswap_shrink(). If the shrink manages to reclaim even a
  little space, zswap will compress/store pages again, and then we get
  the failure described above.

Signed-off-by: Li Wang 
Cc: Seth Jennings 
Cc: Dan Streetman 
Cc: Huang Ying 
Cc: Yu Zhao 
---
 mm/zswap.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/mm/zswap.c b/mm/zswap.c
index 61a5c41..2b537bb 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1007,6 +1007,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
u8 *src, *dst;
struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) };
 
+   if (!zswap_max_pool_percent) {
+   ret = -ENOMEM;
+   goto reject;
+   }
+
/* THP isn't supported */
if (PageTransHuge(page)) {
ret = -EINVAL;
-- 
2.9.5



linux/drivers/cpuidle: cpuidle_enter_state() issue

2018-02-06 Thread Li Wang
Hi Kernel-developers,

The following call trace was caught on kernel-v4.15. Could anyone help
analyze this cpuidle problem?
If you need any more details, please let me know.

Test Env:
IBM KVM Guest on ibm-p8-kvm-03
POWER8E (raw), altivec supported
9216 MB memory, 107 GB disk space

8<
[15002.722413] swapper/15: page allocation failure: order:0,
mode:0x1080020(GFP_ATOMIC), nodemask=(null)
[15002.853793] swapper/15 cpuset=/ mems_allowed=0
[15002.853932] CPU: 15 PID: 0 Comm: swapper/15 Not tainted 4.15.0 #1
[15002.854019] Call Trace:
[15002.854129] [c0023ff77650] [c0940b50]
.dump_stack+0xac/0xfc (unreliable)
[15002.854285] [c0023ff776e0] [c026c678] .warn_alloc+0xe8/0x180
[15002.854376] [c0023ff777a0] [c026d50c]
.__alloc_pages_nodemask+0xd6c/0xf90
[15002.854490] [c0023ff77980] [c02e9cc0]
.alloc_pages_current+0x90/0x120
[15002.854624] [c0023ff77a10] [c07990cc]
.skb_page_frag_refill+0x8c/0x120
[15002.854746] [c0023ff77aa0] [d3a561a8]
.try_fill_recv+0x368/0x620 [virtio_net]
[15003.422855] [c0023ff77ba0] [d3a568ec]
.virtnet_poll+0x25c/0x380 [virtio_net]
[15003.423864] [c0023ff77c70] [c07c18d0] .net_rx_action+0x330/0x4a0
[15003.424024] [c0023ff77d90] [c0960d50] .__do_softirq+0x150/0x3a8
[15003.424197] [c0023ff77e90] [c00ff608] .irq_exit+0x198/0x1b0
[15003.424342] [c0023ff77f10] [c0015504] .__do_irq+0x94/0x1f0
[15003.424485] [c0023ff77f90] [c0026d5c] .call_do_irq+0x14/0x24
[15003.424627] [c0023bc63820] [c00156ec] .do_IRQ+0x8c/0x100
[15003.424776] [c0023bc638c0] [c0008b34]
hardware_interrupt_common+0x114/0x120
[15003.424963] --- interrupt: 501 at .snooze_loop+0xa4/0x1c0
LR = .snooze_loop+0x60/0x1c0
[15003.425164] [c0023bc63bb0] [c0023bc63c50]
0xc0023bc63c50 (unreliable)
[15003.425346] [c0023bc63c30] [c075104c]
.cpuidle_enter_state+0xac/0x390
[15003.425534] [c0023bc63ce0] [c0157adc] .call_cpuidle+0x3c/0x70
[15003.425669] [c0023bc63d50] [c0157e90] .do_idle+0x2a0/0x300
[15003.425815] [c0023bc63e20] [c01580ac]
.cpu_startup_entry+0x2c/0x40
[15003.425995] [c0023bc63ea0] [c0045790]
.start_secondary+0x4d0/0x520
[15003.426170] [c0023bc63f90] [c000aa70]
start_secondary_prolog+0x10/0x14
-8<---

Any response will be appreciated!

-- 
Regards,
Li Wang
Email: wangli.a...@gmail.com


Re: [PATCH] s390/mm: return -ENOMEM in arch_get_unmapped_area[_topdown]

2017-11-08 Thread Li Wang
On Thu, Oct 26, 2017 at 6:16 PM, Martin Schwidefsky
 wrote:
> On Thu, 26 Oct 2017 17:47:39 +0800
> Li Wang  wrote:
>
>> On Thu, Oct 26, 2017 at 5:26 PM, Martin Schwidefsky
>>  wrote:
>> > On Thu, 26 Oct 2017 15:36:10 +0800
>> > Li Wang  wrote:
>> >
>> > The code in mmap.c checks for the per-task limit, 31-bit vs 64-bit.
>> > pgalloc.c checks for the maximum allowed address and does not care
>> > about the task.
>> >
>> >> Fixes: 8ab867cb0806 (s390/mm: fix BUG_ON in crst_table_upgrade)
>> >> Signed-off-by: Li Wang 
>> >
>> > I don't think this patch fixes anything.
>>
>> At least there is a logic error, I think: after applying the patch
>> "s390/mm: fix BUG_ON in crst_table_upgrade", the comparison
>> "if (end >= TASK_SIZE_MAX) return -ENOMEM" in crst_table_upgrade()
>> makes no sense.
>>
>> Doesn't it?
>
> Be careful with TASK_SIZE vs. TASK_SIZE_MAX. They return different
> values for 31-bit compat tasks.

What do you think about this reproducer now failing (the mmap into the
high region succeeds) on the latest kernel?
Should we enlarge HIGH_ADDR to -PAGE_SIZE?

#include <stdio.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>

#define HIGH_ADDR (void *)(1L << 53)

int main(void)
{
	void *addr;
	long map_sz = getpagesize();
	int fd = open("testfile", O_RDWR | O_CREAT, 0666);

	/* Attempt to mmap into a high address; this should fail with ENOMEM */
	addr = mmap(HIGH_ADDR, map_sz, PROT_READ,
		    MAP_SHARED | MAP_FIXED, fd, 0);
	if (addr != MAP_FAILED) {
		printf("FAIL: mmap into high region succeeded unexpectedly\n");
		munmap(addr, map_sz);
		close(fd);
		return 1;
	}

	if (errno != ENOMEM) {
		printf("FAIL: mmap into high region failed unexpectedly - "
		       "expected errno=ENOMEM, got %d\n", errno);
	} else {
		printf("PASS: mmap into high region failed as expected\n");
	}

	close(fd);
	return 0;
}



>
> If the addr parameter is correctly aligned then the if condition in
> crst_table_upgrade is superfluous as TASK_SIZE_MAX is now -PAGE_SIZE
> with the introduction of 5 level page tables. It is important for older
> kernels with only 4 level page tables with a TASK_SIZE_MAX of 2**53.
>
> On the other hand if addr is ever a value between -PAGE_SIZE and -1
> we would end up with an endless loop. That makes the if condition a
> safe-guard and I would like to keep it.
>
> --
> blue skies,
>Martin.
>
> "Reality continues to ruin my life." - Calvin.
>
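
To make Martin's safeguard argument concrete: with 5-level page tables
TASK_SIZE_MAX is -PAGE_SIZE, so an 'end' between -PAGE_SIZE and -1 can never
be reached by raising the asce limit, and without the guard the upgrade loop
would spin forever. A user-space sketch of that arithmetic (values assumed
for illustration):

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;
	unsigned long task_size_max = 0UL - page_size;	/* -PAGE_SIZE */
	unsigned long end = 0UL - 1;			/* in (-PAGE_SIZE, -1] */
	unsigned long asce_limit = 1UL << 53;		/* old 4-level limit */

	/* the guard that crst_table_upgrade() keeps */
	printf("end >= TASK_SIZE_MAX: %d\n", end >= task_size_max);
	/* without it, "while (asce_limit < end)" could never terminate,
	 * since asce_limit tops out at task_size_max, which is < end */
	printf("asce_limit < end:     %d\n", asce_limit < end);
	return 0;
}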



-- 
Li Wang
liw...@redhat.com


Re: [PATCH] s390/mm: return -ENOMEM in arch_get_unmapped_area[_topdown]

2017-10-26 Thread Li Wang
On Thu, Oct 26, 2017 at 5:26 PM, Martin Schwidefsky
 wrote:
> On Thu, 26 Oct 2017 15:36:10 +0800
> Li Wang  wrote:
>
>> It would be very hard to get -ENOMEM returned in crst_table_upgrade(),
>> because the condition (addr + len <= TASK_SIZE) makes every 'end' value
>> smaller than or equal to 'TASK_SIZE_TASK'. So let's move the check to
>> the upper layer.
>
> I have a hard time understanding what scenario you describe. There is no
> 'TASK_SIZE_TASK', only TASK_SIZE, TASK_SIZE_OF and TASK_SIZE_MAX.

Sorry for the typo; I meant to write TASK_SIZE_MAX.

>
> The code in mmap.c checks for the per-task limit, 31-bit vs 64-bit.
> pgalloc.c checks for the maximum allowed address and does not care
> about the task.
>
>> Fixes: 8ab867cb0806 (s390/mm: fix BUG_ON in crst_table_upgrade)
>> Signed-off-by: Li Wang 
>
> I don't think this patch fixes anything.

At least there is a logic error, I think: after applying the patch
"s390/mm: fix BUG_ON in crst_table_upgrade", the comparison
"if (end >= TASK_SIZE_MAX) return -ENOMEM" in crst_table_upgrade()
makes no sense.

Doesn't it?


Thanks for the quick review.


-- 
Li Wang
liw...@redhat.com


[PATCH] s390/mm: return -ENOMEM in arch_get_unmapped_area[_topdown]

2017-10-26 Thread Li Wang
It would be very hard to get -ENOMEM returned in crst_table_upgrade(),
because the condition (addr + len <= TASK_SIZE) makes every 'end' value
smaller than or equal to 'TASK_SIZE_TASK'. So let's move the check to
the upper layer.

Fixes: 8ab867cb0806 (s390/mm: fix BUG_ON in crst_table_upgrade)
Signed-off-by: Li Wang 
---
 arch/s390/mm/mmap.c| 6 ++
 arch/s390/mm/pgalloc.c | 3 +--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index 5bea139..8ddb13a 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -119,6 +119,9 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
return addr;
 
 check_asce_limit:
+   if (addr + len >= TASK_SIZE_MAX)
+   return -ENOMEM;
+
if (addr + len > current->mm->context.asce_limit &&
addr + len <= TASK_SIZE) {
rc = crst_table_upgrade(mm, addr + len);
@@ -184,6 +187,9 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
}
 
 check_asce_limit:
+   if (addr + len >= TASK_SIZE_MAX)
+   return -ENOMEM;
+
if (addr + len > current->mm->context.asce_limit &&
addr + len <= TASK_SIZE) {
rc = crst_table_upgrade(mm, addr + len);
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 05f1f27..5e4b887 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -84,8 +84,7 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
 
/* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */
VM_BUG_ON(mm->context.asce_limit < _REGION2_SIZE);
-   if (end >= TASK_SIZE_MAX)
-   return -ENOMEM;
+
rc = 0;
notify = 0;
while (mm->context.asce_limit < end) {
-- 
2.9.3



[BUG] Unable to handle kernel paging request for unaligned access at address 0xc0000001c52c53df

2017-06-06 Thread Li Wang
Hi,

ltp/access04 always panics the latest mainline kernel-4.12-rc4 on
ppc64le. From the call trace,
I guess the reason is probably that the test mounts an ext2 file
system using the ext4 driver.

A simple way to reproduce:

# dd of=wangli if=/dev/zero count=1024 bs=1024
# mkfs -t ext2 wangli
# mount -t ext4 wangli /mnt/


Have there been any recent changes to ext4 (in kernel-4.12-rc4)?


[  318.557844] EXT4-fs (loop0): mounting ext2 file system using the
ext4 subsystem
[  318.558104] Unable to handle kernel paging request for unaligned
access at address 0xc001c52c53df
[  318.558109] Faulting instruction address: 0xc0918b28
[  318.558114] Oops: Kernel access of bad area, sig: 7 [#1]
[  318.558117] SMP NR_CPUS=2048
[  318.558117] NUMA
[  318.558120] pSeries
[  318.558124] Modules linked in: ext4 jbd2 mbcache loop
rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache sg pseries_rng nfsd
auth_rpcgss nfs_acl lockd ghash_generic gf128mul xts vmx_crypto grace
sunrpc ip_tables xfs libcrc32c sd_mod ibmvscsi ibmveth
scsi_transport_srp dm_mirror dm_region_hash dm_log dm_mod
[  318.558152] CPU: 2 PID: 40748 Comm: access04 Not tainted 4.12.0-rc4 #1
[  318.558155] task: c003889fb200 task.stack: c003ac134000
[  318.558158] NIP: c0918b28 LR: c011c5d4 CTR: c0130900
[  318.558162] REGS: c003ac137420 TRAP: 0600   Not tainted  (4.12.0-rc4)
[  318.558164] MSR: 80010280b033 
[  318.558171]   CR: 28028842  XER: 
[  318.558174] CFAR: c011c5d0 DAR: c001c52c53df DSISR:
 SOFTE: 0
[  318.558174] GPR00: c011c5d4 c003ac1376a0
c1049000 c001c52c53df
[  318.558174] GPR04: c004788657f0 
 0001
[  318.558174] GPR08: 000477be 
8002 
[  318.558174] GPR12: c0130900 cfac1500
 c004648b6800
[  318.558174] GPR16:  c00408ad0400
 00040001
[  318.558174] GPR20: 0001 
4000 c0cc5780
[  318.558174] GPR24: 0001c45ffc5f 
c0cc5780 c001c52c53df
[  318.558174] GPR28: c9d06034 0004
0800 c001c52c53df
[  318.558222] NIP [c0918b28] _raw_spin_lock+0x28/0xc0
[  318.558226] LR [c011c5d4] try_to_wake_up+0x1f4/0x5b0
[  318.558229] Call Trace:
[  318.558231] [c003ac1376a0] [c9d06034]
0xc9d06034 (unreliable)
[  318.558236] [c003ac1376d0] [c011c5d4] try_to_wake_up+0x1f4/0x5b0
[  318.558241] [c003ac137750] [c0102828] create_worker+0x148/0x250
[  318.558245] [c003ac1377f0] [c01059dc]
alloc_unbound_pwq+0x3bc/0x4c0
[  318.558249] [c003ac137850] [c010601c]
apply_wqattrs_prepare+0x2ac/0x320
[  318.558253] [c003ac1378c0] [c01060cc]
apply_workqueue_attrs_locked+0x3c/0xa0
[  318.558257] [c003ac1378f0] [c010662c]
apply_workqueue_attrs+0x4c/0x80
[  318.558261] [c003ac137930] [c01081cc]
__alloc_workqueue_key+0x16c/0x4e0
[  318.558280] [c003ac1379f0] [d8455ca0]
ext4_fill_super+0x1c70/0x3390 [ext4]
[  318.558286] [c003ac137b30] [c0316bdc] mount_bdev+0x21c/0x250
[  318.558298] [c003ac137bd0] [d844db20] ext4_mount+0x20/0x40 [ext4]
[  318.558303] [c003ac137bf0] [c0318184] mount_fs+0x74/0x210
[  318.558307] [c003ac137ca0] [c033fd18] vfs_kern_mount+0x68/0x1d0
[  318.558310] [c003ac137d10] [c0344a28] do_mount+0x278/0xef0
[  318.558314] [c003ac137de0] [c0345ac4] SyS_mount+0x94/0x100
[  318.558319] [c003ac137e30] [c000af84] system_call+0x38/0xe0
[  318.558322] Instruction dump:
[  318.558324] 990d02bc 4bc8 3c4c0073 38420500 7c0802a6 fbe1fff8
7c7f1b78 f8010010
[  318.558329] f821ffd1 3940 994d02bc 814d0008 <7d201829> 2c09
40c20010 7d40192d
[  318.558336] ---[ end trace a2b72248c6bfebea ]---




More info of test environment
--
# uname -rm
4.12.0-rc4 ppc64le

# lscpu
Architecture:  ppc64le
Byte Order:Little Endian
CPU(s):16
On-line CPU(s) list:   0-15
Thread(s) per core:8
Core(s) per socket:1
Socket(s): 2
NUMA node(s):  2
Model: 2.1 (pvr 004b 0201)
Model name:POWER8 (architected), altivec supported
Hypervisor vendor: pHyp
Virtualization type:   para
L1d cache: 64K
L1i cache: 32K
NUMA node0 CPU(s): 0-15
NUMA node1 CPU(s):


-- 
Li Wang
liw...@redhat.com


Re: [PATCH v2] vfs: fix put_compat_statfs64() does not handle errors

2016-11-28 Thread Li Wang
sorry, ping for comments~



On 15 November 2016 at 17:19, Li Wang  wrote:
> put_compat_statfs64() does NOT return -1 and set errno to EOVERFLOW
> when some variables (like f_bsize) have overflowed in the returned
> struct.
>
> The reason is that ubuf->f_blocks is of type __u64, so its size can
> never be 4 bytes as the check in put_compat_statfs64() assumes. Correct
> the check to test the __u32 variables (in struct compat_statfs64)
> instead.
>
> reproducer:
> step1. mount hugetlbfs with two different pagesize on ppc64 arch.
>
> $ hugeadm --pool-pages-max 16M:0
> $ hugeadm --create-mount
> $ mount | grep -i hugetlbfs
> none on /var/lib/hugetlbfs/pagesize-16MB type hugetlbfs 
> (rw,relatime,seclabel,pagesize=16777216)
> none on /var/lib/hugetlbfs/pagesize-16GB type hugetlbfs 
> (rw,relatime,seclabel,pagesize=17179869184)
>
> step2. compile & run this C program.
>
> $ cat statfs64_test.c
>
>  #define _LARGEFILE64_SOURCE
>  #include <stdio.h>
>  #include <unistd.h>
>  #include <sys/statfs.h>
>  #include <sys/syscall.h>
>
>  int main()
>  {
> struct statfs64 sb;
> int err;
>
> err = syscall(SYS_statfs64, "/var/lib/hugetlbfs/pagesize-16GB", 
> sizeof(sb), &sb);
> if (err)
> return -1;
>
> printf("sizeof f_bsize = %d, f_bsize=%ld\n", sizeof(sb.f_bsize), 
> sb.f_bsize);
>
> return 0;
>  }
>
> $ gcc -m32 statfs64_test.c
> $ ./a.out
> sizeof f_bsize = 4, f_bsize=0
>
> Signed-off-by: Li Wang 
> ---
>  fs/compat.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/fs/compat.c b/fs/compat.c
> index bd064a2..543b48c 100644
> --- a/fs/compat.c
> +++ b/fs/compat.c
> @@ -253,9 +253,9 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
>
>  static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf)
>  {
> -   if (sizeof ubuf->f_blocks == 4) {
> -   if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail |
> -             kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL)
> +   if (sizeof(ubuf->f_bsize) == 4) {
> +   if ((kbuf->f_type | kbuf->f_bsize | kbuf->f_namelen |
> +             kbuf->f_frsize | kbuf->f_flags) & 0xffffffff00000000ULL)
> return -EOVERFLOW;
> /* f_files and f_ffree may be -1; it's okay
>  * to stuff that into 32 bits */
> --
> 1.8.3.1
>



-- 
Regards,
Li Wang
Email: wangli.a...@gmail.com


[PATCH v2] vfs: fix put_compat_statfs64() does not handle errors

2016-11-15 Thread Li Wang
put_compat_statfs64() does NOT return -1 and set errno to EOVERFLOW
when some variables (like f_bsize) have overflowed in the returned
struct.

The reason is that ubuf->f_blocks is of type __u64, so its size can
never be 4 bytes as the check in put_compat_statfs64() assumes. Correct
the check to test the __u32 variables (in struct compat_statfs64)
instead.

reproducer:
step1. mount hugetlbfs with two different pagesize on ppc64 arch.

$ hugeadm --pool-pages-max 16M:0
$ hugeadm --create-mount
$ mount | grep -i hugetlbfs
none on /var/lib/hugetlbfs/pagesize-16MB type hugetlbfs 
(rw,relatime,seclabel,pagesize=16777216)
none on /var/lib/hugetlbfs/pagesize-16GB type hugetlbfs 
(rw,relatime,seclabel,pagesize=17179869184)

step2. compile & run this C program.

$ cat statfs64_test.c

 #define _LARGEFILE64_SOURCE
 #include <stdio.h>
 #include <unistd.h>
 #include <sys/statfs.h>
 #include <sys/syscall.h>

 int main()
 {
struct statfs64 sb;
int err;

err = syscall(SYS_statfs64, "/var/lib/hugetlbfs/pagesize-16GB", 
sizeof(sb), &sb);
if (err)
return -1;

printf("sizeof f_bsize = %d, f_bsize=%ld\n", sizeof(sb.f_bsize), 
sb.f_bsize);

return 0;
 }

$ gcc -m32 statfs64_test.c
$ ./a.out
sizeof f_bsize = 4, f_bsize=0

Signed-off-by: Li Wang 
---
 fs/compat.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/compat.c b/fs/compat.c
index bd064a2..543b48c 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -253,9 +253,9 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
 
 static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf)
 {
-   if (sizeof ubuf->f_blocks == 4) {
-   if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail |
-             kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL)
+   if (sizeof(ubuf->f_bsize) == 4) {
+   if ((kbuf->f_type | kbuf->f_bsize | kbuf->f_namelen |
+             kbuf->f_frsize | kbuf->f_flags) & 0xffffffff00000000ULL)
return -EOVERFLOW;
/* f_files and f_ffree may be -1; it's okay
 * to stuff that into 32 bits */
-- 
1.8.3.1
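
To see the overflow concretely: the 16GB hugetlbfs page size
(17179869184 = 2^34) cannot be represented in the 32-bit f_bsize of
compat_statfs64, and plain truncation yields exactly the f_bsize=0 from the
reproducer above. A user-space sketch of the core of the check the patch
installs:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t f_bsize = 17179869184ULL;	/* 16GB page size, 2^34 */

	/* the kernel-side test: any high 32 bits set means the value
	 * cannot fit the compat (32-bit) field */
	if (f_bsize & 0xffffffff00000000ULL)
		printf("kernel should return -EOVERFLOW\n");

	/* what silent truncation gives instead: 2^34 has no low bits set */
	printf("truncated f_bsize = %u\n", (uint32_t)f_bsize);
	return 0;
}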



Re: [PATCH] vfs: fix statfs64() does not handle errors

2016-11-14 Thread Li Wang
On Mon, Nov 07, 2016 at 11:03:11AM -0700, Andreas Dilger wrote:
> On Nov 7, 2016, at 3:21 AM, Li Wang  wrote:
> > 
> > statfs64() does NOT return -1 and setting errno to EOVERFLOW when some
> > variables(like: f_bsize) overflowed in the returned struct.
> > 
> > reproducer:
> > step1. mount hugetlbfs with two different pagesize on ppc64 arch.
> > 
> > $ hugeadm --pool-pages-max 16M:0
> > $ hugeadm --create-mount
> > $ mount | grep -i hugetlbfs
> > none on /var/lib/hugetlbfs/pagesize-16MB type hugetlbfs 
> > (rw,relatime,seclabel,pagesize=16777216)
> > none on /var/lib/hugetlbfs/pagesize-16GB type hugetlbfs 
> > (rw,relatime,seclabel,pagesize=17179869184)
> > 
> > step2. compile & run this C program.
> > 
> > $ cat statfs64_test.c
> > 
> > #define _LARGEFILE64_SOURCE
> > #include <stdio.h>
> > #include <sys/statfs.h>
> > 
> > int main()
> > {
> > struct statfs64 sb;
> > int err;
> > 
> > err = statfs64("/var/lib/hugetlbfs/pagesize-16GB", &sb);
> > if (err)
> > return -1;
> > 
> > printf("sizeof f_bsize = %d, f_bsize=%ld\n", sizeof(sb.f_bsize), 
> > sb.f_bsize);
> > 
> > return 0;
> > }
> > 
> > $ gcc -m32 statfs64_test.c
> > $ ./a.out
> > sizeof f_bsize = 4, f_bsize=0
> > 
> > Signed-off-by: Li Wang 
> > ---
> > 
> > Notes:
> >This is my first patch to kernel fs part, I'm not sure if
> >this one useful, but just want someone have a look.
> > 
> >thanks~
> > 
> > fs/statfs.c | 17 +
> > 1 file changed, 17 insertions(+)
> > 
> > diff --git a/fs/statfs.c b/fs/statfs.c
> > index 083dc0a..849dde95 100644
> > --- a/fs/statfs.c
> > +++ b/fs/statfs.c
> > @@ -151,6 +151,23 @@ static int do_statfs64(struct kstatfs *st, struct statfs64 __user *p)
> > if (sizeof(buf) == sizeof(*st))
> > memcpy(&buf, st, sizeof(*st));
> > else {
> > +   if (sizeof buf.f_bsize == 4) {
> 
> Linux CodingStyle says this should be used like sizeof(buf.f_bsize).

agree.

> 
> > +   if ((st->f_blocks | st->f_bfree | st->f_bavail |
> > +st->f_bsize | st->f_frsize) &
> > +   0xffffffff00000000ULL)
> > +   return -EOVERFLOW;
> 
> I'm not sure I agree with this check.  Sure, if sizeof(buf.f_bsize) == 4
> then the large st->f_bsize will overflow this field, and that is valid.

After thinking it over, I feel that my fix in this patch is not right.

The reproducer.c running on the ppc64 arch was built as 32-bit, but it
does not call SYS_statfs64 in the kernel; it actually calls
compat_sys_statfs64.

# cat reproducer.c

#define _LARGEFILE64_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/statfs.h>
#include <sys/syscall.h>

int main()
{
struct statfs64 sb;
int err;

err = syscall(SYS_statfs64, "/var/lib/hugetlbfs/pagesize-16GB", 
sizeof(sb), &sb);
if (err)
return -1;

printf("sizeof f_bsize = %d, f_bsize=%ld\n", sizeof(sb.f_bsize), 
sb.f_bsize);
return 0;
}

# gcc reproducer.c -m32

# stap -e 'probe kernel.function("compat_sys_statfs64") {printf ("%s",
$$parms);}' -vvv &

# ./a.out 
sizeof f_bsize = 4, f_bsize=0
# pathname=0x16c4 sz=0x58 buf=0xff8a20b0


I guess the fix should look like this:

diff --git a/fs/compat.c b/fs/compat.c
index bd064a2..3d923fd 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -253,7 +253,7 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
 
 static int put_compat_statfs64(struct compat_statfs64 __user *ubuf,
 struct kstatfs *kbuf)
 {
-   if (sizeof ubuf->f_blocks == 4) {
+   if (sizeof ubuf->f_bsize == 4) {
if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail |
 kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL)
return -EOVERFLOW;


I will test it and send a new patch. 

Regards,
Li Wang

> 
> However, that doesn't mean that large values for f_blocks, f_bfree, f_bavail
> should return an error.  I assume you are concerned that the product of the
> large f_bsize and one of those values would overflow a 64-bit bytes value,
> but that is for userspace to worry about, since the values in the individual
> fields themselves are OK.
> 
> We're already over 100PiB Lustre filesystems (2^57 bytes) today, and I
> don't want statfs() failing prematurely because userspace feels the need
> to multiply out the blocks and blocksize into bytes, instead of shifting
> th

[PATCH] vfs: fix statfs64() does not handle errors

2016-11-07 Thread Li Wang
statfs64() does NOT return -1 and set errno to EOVERFLOW when some
variables (like f_bsize) have overflowed in the returned struct.

reproducer:
step1. mount hugetlbfs with two different pagesize on ppc64 arch.

$ hugeadm --pool-pages-max 16M:0
$ hugeadm --create-mount
$ mount | grep -i hugetlbfs
none on /var/lib/hugetlbfs/pagesize-16MB type hugetlbfs 
(rw,relatime,seclabel,pagesize=16777216)
none on /var/lib/hugetlbfs/pagesize-16GB type hugetlbfs 
(rw,relatime,seclabel,pagesize=17179869184)

step2. compile & run this C program.

$ cat statfs64_test.c

 #define _LARGEFILE64_SOURCE
 #include <stdio.h>
 #include <sys/statfs.h>

 int main()
 {
struct statfs64 sb;
int err;

err = statfs64("/var/lib/hugetlbfs/pagesize-16GB", &sb);
if (err)
return -1;

printf("sizeof f_bsize = %d, f_bsize=%ld\n", sizeof(sb.f_bsize), 
sb.f_bsize);

return 0;
 }

$ gcc -m32 statfs64_test.c
$ ./a.out
sizeof f_bsize = 4, f_bsize=0

Signed-off-by: Li Wang 
---

Notes:
This is my first patch to the kernel fs code. I'm not sure whether it
is useful, but I'd like someone to take a look.

thanks~

 fs/statfs.c | 17 +
 1 file changed, 17 insertions(+)

diff --git a/fs/statfs.c b/fs/statfs.c
index 083dc0a..849dde95 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -151,6 +151,23 @@ static int do_statfs64(struct kstatfs *st, struct statfs64 __user *p)
if (sizeof(buf) == sizeof(*st))
memcpy(&buf, st, sizeof(*st));
else {
+   if (sizeof buf.f_bsize == 4) {
+   if ((st->f_blocks | st->f_bfree | st->f_bavail |
+st->f_bsize | st->f_frsize) &
+   0xffffffff00000000ULL)
+   return -EOVERFLOW;
+   /*
+* f_files and f_ffree may be -1; it's okay to stuff
+* that into 32 bits
+*/
+   if (st->f_files != -1 &&
+   (st->f_files & 0xffffffff00000000ULL))
+   return -EOVERFLOW;
+   if (st->f_ffree != -1 &&
+   (st->f_ffree & 0xffffffff00000000ULL))
+   return -EOVERFLOW;
+   }
+
buf.f_type = st->f_type;
buf.f_bsize = st->f_bsize;
buf.f_blocks = st->f_blocks;
-- 
1.8.3.1



mtd: put flash block erasing into wait queue, if has any thread in queue

2014-08-14 Thread Li Wang

Flash erasing may block writing operations.
Make the erase operation sleep when another thread is in the wait queue.


[PATCH] mtd: put flash block erasing into wait queue, if has any thread in queue

2014-08-14 Thread Li Wang
When erasing many flash blocks, the flash writing operation may be stalled:
=
erase thread:
for(;;) {
  do_erase_oneblock() {
mutex_lock(&chip->mutex);
chip->state = FL_ERASING;
mutex_unlock(&chip->mutex);
msleep();   <--- erase wait
mutex_lock(&chip->mutex);
chip->state = FL_READY;
mutex_unlock(&chip->mutex);   <--- finish one block erasing
  }
}

write thread:
 retry:
  mutex_lock(&cfi->chips[chipnum].mutex);
  if (cfi->chips[chipnum].state != FL_READY) {
set_current_state(TASK_UNINTERRUPTIBLE);
add_wait_queue(&cfi->chips[chipnum].wq, &wait);
mutex_unlock(&cfi->chips[chipnum].mutex);
schedule();   <--- write wait
remove_wait_queue(&cfi->chips[chipnum].wq, &wait);
goto retry;
=
The writing operation only gets a chance to run when one block erase
finishes. But if the writing operation has been put into the wait queue
(write wait), the mutex_unlock (finishing one block erase) cannot wake it
up. So, if many blocks need to be erased, the writing operation has no
chance to run at all.
This causes the following backtrace:
=
INFO: task sh:727 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
sh D 0fe76ad0 0 727 711 0x
Call Trace:
[df0cdc40] [0002] 0x2 (unreliable)
[df0cdd00] [c0008974] __switch_to+0x64/0xd8
[df0cdd10] [c043f2e4] schedule+0x218/0x408
[df0cdd60] [c04401f4] __mutex_lock_slowpath+0xd0/0x174
[df0cdda0] [c044087c] mutex_lock+0x5c/0x60
[df0cddc0] [c00ff18c] do_truncate+0x60/0xa8
[df0cde10] [c010d1d0] do_last+0x5a0/0x6d0
[df0cde40] [c010f778] do_filp_open+0x1d4/0x5e8
[df0cdf20] [c00fe0d0] do_sys_open+0x64/0x19c
[df0cdf40] [c0010d04] ret_from_syscall+0x0/0x4
--- Exception: c01 at 0xfe76ad0
LR = 0xffd3ae8
...
sh D 0fe77068 0 607 590 0x 
Call Trace: 
[dbca98e0] [c009ad4c] rcu_process_callbacks+0x38/0x4c (unreliable) 
[dbca99a0] [c0008974] __switch_to+0x64/0xd8 
[dbca99b0] [c043f2e4] schedule+0x218/0x408 
[dbca9a00] [c034bfa4] cfi_amdstd_write_words+0x364/0x480 
[dbca9a80] [c034c9b4] cfi_amdstd_write_buffers+0x8f4/0xca8 
[dbca9b10] [c03437ac] part_write+0xb0/0xe4 
[dbca9b20] [c02051f8] jffs2_flash_direct_writev+0xdc/0x140 
[dbca9b70] [c02079ac] jffs2_flash_writev+0x38c/0x4fc 
[dbca9bc0] [c01fc6ac] jffs2_write_dnode+0x140/0x5bc 
[dbca9c40] [c01fd0dc] jffs2_write_inode_range+0x288/0x514 
[dbca9cd0] [c01f5ed4] jffs2_write_end+0x190/0x37c 
[dbca9d10] [c00bf2f0] generic_file_buffered_write+0x100/0x26c 
[dbca9da0] [c00c1828] __generic_file_aio_write+0x2c0/0x4fc 
[dbca9e10] [c00c1ad4] generic_file_aio_write+0x70/0xf0 
[dbca9e40] [c0100398] do_sync_write+0xac/0x120 
[dbca9ee0] [c0101088] vfs_write+0xb4/0x184 
[dbca9f00] [c01012cc] sys_write+0x50/0x10c 
[dbca9f40] [c0010d04] ret_from_syscall+0x0/0x4 
--- Exception: c01 at 0xfe77068 
LR = 0xffd3c8c
...
flash_erase R running 0 869 32566 0x 
Call Trace: 
[dbc6dae0] [c0017ac0] kunmap_atomic+0x14/0x3c (unreliable) 
[dbc6dba0] [c0008974] __switch_to+0x64/0xd8 
[dbc6dbb0] [c043f2e4] schedule+0x218/0x408 
[dbc6dc00] [c043fbe4] schedule_timeout+0x170/0x2cc 
[dbc6dc50] [c00531f0] msleep+0x1c/0x34 
[dbc6dc60] [c034d538] do_erase_oneblock+0x7d0/0x944 
[dbc6dcd0] [c0349dfc] cfi_varsize_frob+0x1a8/0x2cc 
[dbc6dd20] [c034e4d4] cfi_amdstd_erase_varsize+0x30/0x60 
[dbc6dd30] [c0343abc] part_erase+0x80/0x104 
[dbc6dd40] [c0345c80] mtd_ioctl+0x3e0/0xc3c 
[dbc6de80] [c0111050] vfs_ioctl+0xcc/0xe4 
[dbc6dea0] [c011122c] do_vfs_ioctl+0x80/0x770 
[dbc6df10] [c01119b0] sys_ioctl+0x94/0x108 
[dbc6df40] [c0010d04] ret_from_syscall+0x0/0x4 
--- Exception: c01 at 0xff586a0 
LR = 0xff58608 
=
So, if there is any thread in the wait queue, put the erase operation
into the queue as well. This gives the writing operation a chance to run.

Signed-off-by: Li Wang 
---
 drivers/mtd/chips/cfi_cmdset_0002.c |   13 +
 1 file changed, 13 insertions(+)

diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c b/drivers/mtd/chips/cfi_cmdset_0002.c
index 5a4bfe3..53f5774 100644
--- a/drivers/mtd/chips/cfi_cmdset_0002.c
+++ b/drivers/mtd/chips/cfi_cmdset_0002.c
@@ -2400,6 +2400,19 @@ static int __xipram do_erase_oneblock(struct map_info *map, struct flchip *chip,
chip->state = FL_READY;
DISABLE_VPP(map);
put_chip(map, chip, adr);
+   if (waitqueue_active(&chip->wq)) {
+   set_current_state(TASK_UNINTERRUPTIBLE);
+   add_wait_queue(&chip->wq, &wait);
+   mutex_unlock(&chip->mutex);
+   /*
+* If the other thread in the queue fails to wake up the eraser
+* within 3ms, the eraser will wake up by itself. This way, erasing
+* cannot be hung up by an error in the other queued thread.
+*/
+   schedule_timeout(msecs_to_jiffies(3));
+   remove_wait_queue(&chip->wq, &wait);
+   re
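
The archived diff is cut off above; a self-contained sketch of the
yield-to-waiters idiom it adds (simplified helper, not the literal
cfi_cmdset_0002.c hunk):

#include <linux/jiffies.h>
#include <linux/mtd/flashchip.h>
#include <linux/sched.h>
#include <linux/wait.h>

/* After finishing one block erase: if another thread (e.g. a writer)
 * sleeps on the chip's wait queue, go to sleep ourselves so it can
 * grab the chip before the next block is erased. */
static void erase_yield_to_waiters(struct flchip *chip)
{
	DECLARE_WAITQUEUE(wait, current);

	if (!waitqueue_active(&chip->wq))
		return;

	set_current_state(TASK_UNINTERRUPTIBLE);
	add_wait_queue(&chip->wq, &wait);
	mutex_unlock(&chip->mutex);
	/* bounded sleep: if nobody wakes us within 3ms, resume anyway,
	 * so a misbehaving waiter cannot stall erasing forever */
	schedule_timeout(msecs_to_jiffies(3));
	remove_wait_queue(&chip->wq, &wait);
	mutex_lock(&chip->mutex);
}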

Re: [PATCH 2/3] Add shrink_pagecache_parent

2014-01-07 Thread Li Wang

Hi,

On 01/03/2014 07:55 AM, Andrew Morton wrote:

On Mon, 30 Dec 2013 21:45:17 +0800 Li Wang  wrote:


Analogous to shrink_dcache_parent except that it collects inodes.
It is not very appropriate to be put in dcache.c, but d_walk can only
be invoked from here.


Please cc Dave Chinner on future revisions.  He be da man.

The overall intent of the patchset seems reasonable and I agree that it
can't be efficiently done from userspace with the current kernel API.
We *could* do it from userspace by providing facilities for userspace to
query the VFS caches: "is this pathname in the dentry cache" and "is
this inode in the inode cache".


Even if we had these available, I am afraid it would still introduce
non-negligible overhead due to frequent system calls for a directory
walking operation, especially under massive small-file situations.


--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1318,6 +1318,42 @@ void shrink_dcache_parent(struct dentry *parent)
  }
  EXPORT_SYMBOL(shrink_dcache_parent);

+static enum d_walk_ret gather_inode(void *data, struct dentry *dentry)
+{
+   struct list_head *list = data;
+   struct inode *inode = dentry->d_inode;
+
+   if ((inode == NULL) || ((!inode_owner_or_capable(inode)) &&
+   (!capable(CAP_SYS_ADMIN))))
+   goto out;
+   spin_lock(&inode->i_lock);
+   if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||


It's unclear what rationale lies behind this particular group of tests.


+   (inode->i_mapping->nrpages == 0) ||
+   (!list_empty(&inode->i_lru))) {


arg, the "Inode locking rules" at the top of fs/inode.c needs a
refresh, I suspect.  It is too vague.

Formally, inode->i_lru is protected by
i_sb->s_inode_lru->node[nid].lock, not by ->i_lock.  I guess you can
just do a list_lru_add() and that will atomically add the inode to your
local list_lru if ->i_lru wasn't being used for anything else.

I *think* that your use of i_lock works OK, because code which fiddles
with i_lru and s_inode_lru also takes i_lock.  However we need to
decide which is the preferred and official lock.  ie: what is the
design here??

However...  most inodes will be on an LRU list, won't they?  Doesn't
this reuse of i_lru mean that many inodes will fail to be processed?
If so, we might need to add a new list_head to the inode, which will be
problematic.


As far as I know (correct me if I am wrong), an inode is put onto the
superblock LRU list only when it has a zero reference count. In most
situations, there is at least one dentry referring to it, so it will
not be on any LRU list.



Aside: inode_lru_isolate() fiddles directly with inode->i_lru without
taking i_sb->s_inode_lru->node[nid].lock.  Why doesn't this make a
concurrent s_inode_lru walker go oops??  Should we be using
list_lru_del() in there?  (which should have been called
list_lru_del_init(), sigh).


It seems inode_lru_isolate() is only called by prune_icache_sb() as
a callback function. Before calling it, the caller already holds
the lock.


Re: [PATCH 0/3] Fadvise: Directory level page cache cleaning support

2014-01-02 Thread Li Wang

Do we really need to clean the dcache/icache at the current stage?
That would require more code work: as it stands, iput() puts
unreferenced inodes onto the superblock LRU list. To free the inodes
inside a specific directory, it seems we do not have a handy API to
use; we would need to modify iput() to recognize our situation and
collect those inodes onto our own list rather than the superblock LRU
list. Maybe we should stay with the current approach for now, since
it is simple and captures the major benefits, and leave the
dcache/icache cleaning for the future?

On 2013/12/31 5:33, Dave Hansen wrote:

On 12/30/2013 11:40 AM, Andreas Dilger wrote:

On Dec 30, 2013, at 12:18, Dave Hansen  wrote:

Why is this necessary to do in the kernel?  Why not leave it to
userspace to walk the filesystem(s)?


I would suspect that trying to do it in userspace would be quite bad.
It would require traversing the whole directory tree to issue cache
flushes for each subdirectory, but it doesn't know when to stop
traversal. That would mean the "cache flush" would turn into "cache
pollute" and cause a lot of disk IO for subdirectories not in cache to
begin with.


That makes sense for dentries at least and is a pretty good reason.
Probably good enough to to include some text in the patch description.
;)  Perhaps: "We need this interface because we have no way of
determining what is in the dcache from userspace, and we do not want
userspace to pollute the dcache going and looking for page cache to evict."

One other thing that bothers me: POSIX_FADV_DONTNEED on a directory
seems like it should do something with the _directory_.  It should undo
the kernel's caching that happens as a result of readdir().

Should this also be trying to drop the dentry/inode entries like
"echo 2 > .../drop_caches" does?



[PATCH 0/3] Fadvise: Directory level page cache cleaning support

2013-12-30 Thread Li Wang
VFS relies on an LRU-like page cache eviction algorithm
to reclaim cache space. Such a general and simple algorithm
is good in that it is application-independent, and it works
for normal situations. However, it sometimes does not help much
for applications that are performance-sensitive or under
heavy load, since LRU may incorrectly evict pages that are about
to be referenced, resulting in severe performance degradation due to
cache thrashing. Applications have the most knowledge
about the things they are doing; they can always do better if
they are given the chance. This motivates endowing applications
with more ability to manipulate the page cache.

Currently, Linux supports file-system-wide cache cleaning by virtue of
the proc interface 'drop_caches', but it is very coarse-grained and
was originally proposed for debugging. The other option is file-level
page cache cleaning through 'fadvise'; however, this is sometimes less
flexible and not easy to use, especially for directory-wide operations
or under massive small-file situations.

This patch extends 'fadvise' to support directory-level page cache
cleaning. A call to posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED)
with 'fd' referring to a directory will recursively reclaim the page
cache entries of files inside 'fd'. For security reasons, inodes for
which the caller does not have appropriate permissions will not
be manipulated.

It is easy to demonstrate the advantages of directory-level page
cache cleaning. We use a machine with a Pentium(R) Dual-Core CPU
E5800 @ 3.20GHz and 2GB of memory. Two directories named '1'
and '3' are created, each containing X (360 - 460) files,
each with a size of 2MB. The test script is as follows,

The test scripts (without cache cleaning)
#!/bin/bash
cp -r 1 2
sync
cp -r 3 4
sync
time grep "data" 1/*

The time on 'grep "data" 1/*' is measured
with/without cache cleaning, under different file counts.
With cache cleaning, we clean all cache entries of files
in '2' before doing 'cp -r 3 4' by using pretty much
the following two statements,
fd = open("2", O_DIRECTORY, 0644);
posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);

The results are as follows (in seconds), 
X: Number of files inside each directory

 X   Without Cleaning   With Cleaning
360  2.385               1.361
380  3.159               1.466
400  3.972               1.558
420  4.823               1.548
440  5.798               1.702
460  6.888               2.197

The page cache is not large enough to buffer all four
directories, so 'cp -r 3 4' will cause some
entries of '1' to be evicted (due to LRU). When re-accessing '1',
some entries need to be reloaded from disk, which is time-consuming.
In this case, cleaning '2' before 'cp -r 3 4' enjoys a good
speedup.
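
For completeness, a runnable version of the two-statement snippet above
(error handling added; the directory-wide effect of course requires a kernel
carrying this patchset):

/* dirclean.c - build with: gcc -o dirclean dirclean.c */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd, err;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <directory>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY | O_DIRECTORY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* recursively drop page cache entries of files under argv[1] */
	err = posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
	if (err)
		fprintf(stderr, "posix_fadvise: %d\n", err);

	close(fd);
	return err ? 1 : 0;
}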
 
Li Wang (3):
  VFS: Add the declaration of shrink_pagecache_parent
  Add shrink_pagecache_parent
  Fadvise: Add the ability for directory level page cache cleaning

 fs/dcache.c|   36 
 include/linux/dcache.h |1 +
 mm/fadvise.c   |4 
 3 files changed, 41 insertions(+)

-- 
1.7.9.5



[PATCH 3/3] Fadvise: Add the ability for directory level page cache cleaning

2013-12-30 Thread Li Wang

Signed-off-by: Li Wang 
Signed-off-by: Yunchuan Wen 
---
 mm/fadvise.c |4 
 1 file changed, 4 insertions(+)

diff --git a/mm/fadvise.c b/mm/fadvise.c
index 3bcfd81..644d32d 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -113,6 +113,10 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
case POSIX_FADV_NOREUSE:
break;
case POSIX_FADV_DONTNEED:
+   if (S_ISDIR(file_inode(f.file)->i_mode)) {
+   shrink_pagecache_parent(f.file->f_dentry);
+   goto out;
+   }
if (!bdi_write_congested(mapping->backing_dev_info))
__filemap_fdatawrite_range(mapping, offset, endbyte,
   WB_SYNC_NONE);
-- 
1.7.9.5



[PATCH 1/3] VFS: Add the declaration of shrink_pagecache_parent

2013-12-30 Thread Li Wang

Signed-off-by: Li Wang 
Signed-off-by: Yunchuan Wen 
---
 include/linux/dcache.h |1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index bf72e9a..6262171 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -249,6 +249,7 @@ extern struct dentry *d_find_any_alias(struct inode *inode);
 extern struct dentry * d_obtain_alias(struct inode *);
 extern void shrink_dcache_sb(struct super_block *);
 extern void shrink_dcache_parent(struct dentry *);
+extern void shrink_pagecache_parent(struct dentry *);
 extern void shrink_dcache_for_umount(struct super_block *);
 extern int d_invalidate(struct dentry *);
 
-- 
1.7.9.5



[PATCH 2/3] Add shrink_pagecache_parent

2013-12-30 Thread Li Wang
Analogous to shrink_dcache_parent except that it collects inodes.
It is not very appropriate to be put in dcache.c, but d_walk can only
be invoked from here.

Signed-off-by: Li Wang 
Signed-off-by: Yunchuan Wen 
---
 fs/dcache.c |   36 
 1 file changed, 36 insertions(+)

diff --git a/fs/dcache.c b/fs/dcache.c
index 6055d61..0fc0f80 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1318,6 +1318,42 @@ void shrink_dcache_parent(struct dentry *parent)
 }
 EXPORT_SYMBOL(shrink_dcache_parent);
 
+static enum d_walk_ret gather_inode(void *data, struct dentry *dentry)
+{
+   struct list_head *list = data;
+   struct inode *inode = dentry->d_inode;
+
+   if ((inode == NULL) || ((!inode_owner_or_capable(inode)) &&
+   (!capable(CAP_SYS_ADMIN))))
+   goto out;
+   spin_lock(&inode->i_lock);
+   if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+   (inode->i_mapping->nrpages == 0) ||
+   (!list_empty(&inode->i_lru))) {
+   goto out_unlock;
+   }
+   __iget(inode);
+   list_add_tail(&inode->i_lru, list);
+out_unlock:
+   spin_unlock(&inode->i_lock);
+out:
+   return D_WALK_CONTINUE;
+}
+
+void shrink_pagecache_parent(struct dentry *parent)
+{
+   LIST_HEAD(list);
+   struct inode *inode, *next;
+
+   d_walk(parent, &list, gather_inode, NULL);
+   list_for_each_entry_safe(inode, next, &list, i_lru) {
+   list_del_init(&inode->i_lru);
+   invalidate_mapping_pages(inode->i_mapping, 0, -1);
+   iput(inode);
+   }
+}
+EXPORT_SYMBOL(shrink_pagecache_parent);
+
 static enum d_walk_ret umount_collect(void *_data, struct dentry *dentry)
 {
struct select_data *data = _data;
-- 
1.7.9.5

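A note on the two-phase structure above (a reading of the code, not text from
the posting):

/*
 * d_walk() traverses the dentry tree under spinlocks, where blocking
 * calls are not allowed, so gather_inode() only grabs a reference
 * (__iget) and queues the inode on a private list. The i_lru field is
 * safe to reuse for that list because inodes already sitting on the
 * global LRU (!list_empty(&inode->i_lru)) are skipped. The blocking
 * work - invalidate_mapping_pages() and iput() - then runs in
 * shrink_pagecache_parent() after the walk has finished.
 */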


Re: [PATCH 0/3] Ceph fscache: Fix kernel panic due to a race

2013-12-27 Thread Li Wang

Hi Milosz,
  As far as I know, fscache currently does not act as a write cache
for Ceph, except for a call to ceph_readpage_to_fscache() in
ceph_writepage(), and that is unrelated to our test case. According to
our observation, our test case never goes through ceph_writepage();
instead, it goes through ceph_writepages(). So in other words, I do
not think this is related to caching in the write path.
  May I try to explain the panic in more detail,

(1) dd if=/dev/zero of=cephfs/foo bs=8 count=512
(2) echo 3 > /proc/sys/vm/drop_caches
(3) dd if=cephfs/foo of=/dev/null bs=8 count=1024

For statement (1), we are repeatedly appending to a file, so
ceph_aio_write() repeatedly updates inode->i_size; however, these
updates are not immediately reflected in object->store_limit_l. For
statement (3), when we start reading the second page at [4096, 8192),
Ceph finds that the page is not cached in fscache and decides to write
it into fscache. During this process, cachefiles_write_page() finds
that object->store_limit_l < 4096 (page->index << 12), which causes
the panic. Does it make sense?
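
In code, the sequence is roughly the following (a sketch, not the
verbatim cachefiles code):

/* After the appends in (1), inode->i_size has grown but
 * object->store_limit_l still holds the old value. */
static void sketch_store_page(struct fscache_object *object,
			      struct page *page)
{
	loff_t pos = (loff_t)page->index << PAGE_SHIFT;	/* index << 12 */

	/* cachefiles asserts a write never lands at or beyond the store
	 * limit; with store_limit_l stale (< 4096 here), storing the
	 * page at index 1 (pos == 4096) trips this and panics. */
	ASSERT(pos < object->store_limit_l);
}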

Cheers,
Li Wang

On 2013/12/27 6:51, Milosz Tanski wrote:

Li,

I looked at the patchset am I correct that this only happens when we
enable caching in the write path?

- Milosz

On Thu, Dec 26, 2013 at 9:29 AM, Li Wang  wrote:

From: Yunchuan Wen 

The following scripts could easily panic the kernel,

#!/bin/bash
mount -t ceph -o fsc MONADDR:/ cephfs
rm -rf cephfs/foo
dd if=/dev/zero of=cephfs/foo bs=8 count=512
echo 3 > /proc/sys/vm/drop_caches
dd if=cephfs/foo of=/dev/null bs=8 count=1024

This is because, when writing a page into fscache, the code asserts
that the write position does not exceed object->store_limit_l, which
is supposed to equal inode->i_size. However, in the current
implementation, object->store_limit_l is not synchronized with the new
inode->i_size immediately after a file write. This introduces a race:
writing a new page into fscache can reach the ASSERT that the write
position has exceeded object->store_limit_l, causing a kernel panic.
This patch fixes it.

Yunchuan Wen (3):
   Ceph fscache: Add an interface to synchronize object store limit
   Ceph fscache: Update object store limit after writing
   Ceph fscache: Wait for completion of object initialization

  fs/ceph/cache.c |1 +
  fs/ceph/cache.h |   10 ++
  fs/ceph/file.c  |3 +++
  3 files changed, 14 insertions(+)

--
1.7.9.5








[PATCH 1/3] Ceph fscache: Add an interface to synchronize object store limit

2013-12-26 Thread Li Wang
From: Yunchuan Wen 

Add an interface to explicitly synchronize object->store_limit[_l]
with inode->i_size

Signed-off-by: Yunchuan Wen 
Signed-off-by: Min Chen 
Signed-off-by: Li Wang 
---
 fs/ceph/cache.h |   10 ++
 1 file changed, 10 insertions(+)

diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index ba94940..262106b 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -48,6 +48,12 @@ void ceph_readpage_to_fscache(struct inode *inode, struct 
page *page);
 void ceph_invalidate_fscache_page(struct inode* inode, struct page *page);
 void ceph_queue_revalidate(struct inode *inode);
 
+static inline void ceph_fscache_update_objectsize(struct inode *inode)
+{
+   struct ceph_inode_info *ci = ceph_inode(inode);
+   fscache_attr_changed(ci->fscache);
+}
+
 static inline void ceph_fscache_invalidate(struct inode *inode)
 {
fscache_invalidate(ceph_inode(inode)->fscache);
@@ -127,6 +133,10 @@ static inline void ceph_readpage_to_fscache(struct inode 
*inode,
 {
 }
 
+static inline void ceph_fscache_update_objectsize(struct inode *inode)
+{
+}
+
 static inline void ceph_fscache_invalidate(struct inode *inode)
 {
 }
-- 
1.7.9.5



[PATCH 2/3] Ceph fscache: Update object store limit after file writing

2013-12-26 Thread Li Wang
From: Yunchuan Wen 

Synchronize object->store_limit[_l] with new inode->i_size after file writing.

Signed-off-by: Yunchuan Wen 
Signed-off-by: Min Chen 
Signed-off-by: Li Wang 
---
 fs/ceph/file.c |3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 3de8982..b6df7ab 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -786,6 +786,7 @@ retry_snap:
goto retry_snap;
}
} else {
+   loff_t old_size = inode->i_size;
/*
 * No need to acquire the i_truncate_mutex. Because
 * the MDS revokes Fwb caps before sending truncate
@@ -796,6 +797,8 @@ retry_snap:
written = generic_file_buffered_write(iocb, iov, nr_segs,
  pos, &iocb->ki_pos,
  count, 0);
+   if (inode->i_size > old_size)
+   ceph_fscache_update_objectsize(inode);
mutex_unlock(&inode->i_mutex);
}
 
-- 
1.7.9.5



[PATCH 0/3] Ceph fscache: Fix kernel panic due to a race

2013-12-26 Thread Li Wang
From: Yunchuan Wen 

The following scripts could easily panic the kernel,

#!/bin/bash
mount -t ceph -o fsc MONADDR:/ cephfs
rm -rf cephfs/foo
dd if=/dev/zero of=cephfs/foo bs=8 count=512
echo 3 > /proc/sys/vm/drop_caches
dd if=cephfs/foo of=/dev/null bs=8 count=1024

This is because, when writing a page into fscache, the code asserts
that the write position does not exceed object->store_limit_l, which
is supposed to equal inode->i_size. However, in the current
implementation, object->store_limit_l is not synchronized with the new
inode->i_size immediately after a file write. This introduces a race:
writing a new page into fscache can reach the ASSERT that the write
position has exceeded object->store_limit_l, causing a kernel panic.
This patch fixes it.

Yunchuan Wen (3):
  Ceph fscache: Add an interface to synchronize object store limit
  Ceph fscache: Update object store limit after writing
  Ceph fscache: Wait for completion of object initialization

 fs/ceph/cache.c |1 +
 fs/ceph/cache.h |   10 ++
 fs/ceph/file.c  |3 +++
 3 files changed, 14 insertions(+)

-- 
1.7.9.5



[PATCH 3/3] Ceph fscache: Wait for completion of object initialization

2013-12-26 Thread Li Wang
From: Yunchuan Wen 

The object store limit needs to be updated after writing, and this can
only be done once the corresponding object has been initialized.
Currently, object initialization is done asynchronously, which
introduces a race: if a file is opened and then immediately written,
the initialization may not have completed yet, and the code will reach
the ASSERT in fscache_submit_exclusive_op(), causing a kernel bug.

Signed-off-by: Yunchuan Wen 
Signed-off-by: Min Chen 
Signed-off-by: Li Wang 
---
 fs/ceph/cache.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 8c44fdd..834f9f3 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -205,6 +205,7 @@ void ceph_fscache_register_inode_cookie(struct 
ceph_fs_client* fsc,
ci->fscache = fscache_acquire_cookie(fsc->fscache,
 &ceph_fscache_inode_object_def,
 ci, true);
+   fscache_check_consistency(ci->fscache);
 done:
mutex_unlock(&inode->i_mutex);
 
-- 
1.7.9.5

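A timeline of the race this one-liner closes (assembled from the patch and
the fscache call chain; a sketch, not verbatim kernel code):

/*
 * Without the patch:
 *
 *   open()
 *     ceph_fscache_register_inode_cookie()
 *       fscache_acquire_cookie()        object init queued, runs async
 *   write()                             may run before init completes
 *     ceph_fscache_update_objectsize()
 *       fscache_attr_changed()          an exclusive operation
 *         fscache_submit_exclusive_op() ASSERT fires on an object that
 *                                       is not yet initialized
 *
 * With the patch, fscache_check_consistency(ci->fscache) in the open
 * path performs a synchronous operation against the object, so open()
 * does not return before initialization has finished.
 */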


[PATCH 1/2] ceph fscache: Introduce a routine for uncaching single no data page from fscache

2013-12-19 Thread Li Wang

Signed-off-by: Li Wang 
---
 fs/ceph/cache.h |   13 +
 1 file changed, 13 insertions(+)

diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index ba94940..da95f61 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -67,6 +67,14 @@ static inline int ceph_release_fscache_page(struct page 
*page, gfp_t gfp)
return fscache_maybe_release_page(ci->fscache, page, gfp);
 }
 
+static inline void ceph_fscache_readpage_cancel(struct inode *inode,
+   struct page *page)
+{
+   struct ceph_inode_info *ci = ceph_inode(inode);
+   if (fscache_cookie_valid(ci->fscache) && PageFsCache(page))
+   __fscache_uncache_page(ci->fscache, page);
+}
+
 static inline void ceph_fscache_readpages_cancel(struct inode *inode,
 struct list_head *pages)
 {
@@ -145,6 +153,11 @@ static inline int ceph_release_fscache_page(struct page 
*page, gfp_t gfp)
return 1;
 }
 
+static inline void ceph_fscache_readpage_cancel(struct inode *inode,
+   struct page *page)
+{
+}
+
 static inline void ceph_fscache_readpages_cancel(struct inode *inode,
 struct list_head *pages)
 {
-- 
1.7.9.5



[PATCH 0/2] ceph fscache: uncaching single no data page when error

2013-12-19 Thread Li Wang
Currently, if a new page is allocated into fscache in readpage() but
no data is read into it due to an error encountered while reading from
the OSDs, the slot in fscache is not uncached. This patch fixes that.

Li Wang (2):
  ceph: Introduce a routine for uncaching single no data page from
fscache
  ceph: Uncaching no data page from fscache in readpage()

 fs/ceph/addr.c  |1 +
 fs/ceph/cache.h |   13 +
 2 files changed, 14 insertions(+)

-- 
1.7.9.5



[PATCH 2/2] ceph fscache: Uncaching no data page from fscache in readpage()

2013-12-19 Thread Li Wang
Currently, if a new page is allocated into fscache in readpage() but
no data is read into it due to an error encountered while reading from
the OSDs, the slot in fscache is not uncached. This patch fixes that.

Signed-off-by: Li Wang 
---
 fs/ceph/addr.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index ec3ba43..0cc9749 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -209,6 +209,7 @@ static int readpage_nounlock(struct file *filp, struct page 
*page)
err = 0;
if (err < 0) {
SetPageError(page);
+   ceph_fscache_readpage_cancel(inode, page);
goto out;
} else {
if (err < PAGE_CACHE_SIZE) {
-- 
1.7.9.5



Re: [PATCH 0/5] VFS: Directory level cache cleaning

2013-12-17 Thread Li Wang

Both 'drop_caches' and 'vfs_cache_pressure' offer only coarse-grained
control. Sometimes they do not help much for performance-sensitive
applications. General and simple algorithms are good because they are
application-independent and work in normal situations. However, since
applications know the most about what they are doing, they can always
do better if given the chance. I think that is why compilers have
directives such as __inline__ and __align__, and CPU caches provide
prefetch hints. Similarly, I think we should give applications more
ability to manipulate the metadata/page cache. This is potentially
beneficial for avoiding performance degradation due to cache
thrashing.

'drop_caches' may not be the right way to go, since it is intended
for debugging. 'fadvise' was originally proposed for this purpose;
I think we may start by making 'fadvise' able to handle directory
level page cache cleaning.

On 2013/12/18 6:05, Dave Chinner wrote:

On Mon, Dec 16, 2013 at 07:00:04AM -0800, Li Wang wrote:

Currently, Linux only supports file-system-wide VFS
cache (dentry cache and page cache) cleaning through
'/proc/sys/vm/drop_caches'. Sometimes this is not flexible enough.
Applications may know exactly whether metadata and data will be
referenced again in the future; a desirable mechanism is to let
applications reclaim the memory of unused cache entries at a finer
granularity - the directory level. This lets applications keep hot
metadata and data (to be referenced in the future) in the cache and
kick unused entries out to avoid cache thrashing. Another advantage
is that it is more flexible for debugging.

This patch extends the 'drop_caches' interface to
support directory level cache cleaning and is fully backward
compatible. '{1,2,3}' keeps the same semantics as before. In
addition, "{1,2,3}:DIRECTORY_PATH_NAME" recursively cleans the
caches under DIRECTORY_PATH_NAME. For example,
'echo 1:/home/foo/jpg > /proc/sys/vm/drop_caches' will clean the
page caches of the files inside '/home/foo/jpg'.

It is easy to demonstrate the advantage of directory level
cache cleaning. We use a virtual machine configured with
an Intel(R) Xeon(R) 8-core CPU E5506 @ 2.13GHz, and with 1GB
memory.  Three directories named '1', '2' and '3' are created,
with each containing 180k – 280k files. The test program
opens all files in a directory and then tries the next directory.
The order for accessing the directories is '1', '2', '3',
'1'.

The time to access '1' the second time is measured
with and without cache cleaning, under different file counts.
With cache cleaning, we clean all cache entries of the files
in '2' before accessing the files in '3'. The results
are as follows (in seconds):


This sounds like a highly contrived test case. There is no reason
why dentry cache access time would change going from 180k to 280k
files in 3 directories unless you're right at the memory pressure
balance point in terms of cache sizing.


(Note: by default, the VFS moves unreferenced inodes onto a global
LRU list rather than freeing them. For this experiment, we modified
iput() to force the inode to be freed as well; this behavior and the
related code are left for further discussion and thus are not
reflected in this patch.)

Number of files:    180k   200k    220k    240k    260k
Without cleaning:  2.165  6.977  10.032  11.571  13.443
With cleaning:     1.949  1.906   2.336   2.918   3.651

When the number of files is 180k in each directory,
the metadata cache is large enough to buffer all entries
of the three directories, so re-accessing '1' hits in
the cache regardless of whether '2' is cleaned up or not.
As the number of files increases, the cache can only
buffer a bit more than two directories. Accessing '3' causes some
entries of '1' to be evicted (due to LRU). When re-accessing '1',
some entries need to be reloaded from disk, which is time-consuming.


Ok, so exactly as I thought - your example working set is slightly
larger than what the cache holds. Hence what you are describing is
a cache reclaim threshold effect: something you can avoid with
/proc/sys/vm/vfs_cache_pressure.

Cheers,

Dave.




Re: [PATCH 0/5] VFS: Directory level cache cleaning

2013-12-17 Thread Li Wang

This extension is just an add-on. The original debugging
capability is still there, and more flexible debugging is now possible.

On 2013/12/17 17:12, Li Zefan wrote:

On 2013/12/17 15:23, Li Wang wrote:

If we do want to equip fadvise() with directory level page cache
cleaning, this could be solved by checking (inode_permission() ||
capable(CAP_SYS_ADMIN)) before manipulating the page cache of that
inode. We think the current extension to 'drop_caches' is fully
backward compatible - the old semantics stay unchanged - and add-on
features for finer-grained cache cleaning should also be desirable.



I don't think you can extend the drop_caches interface this way. It should
be used for debuging only.

commit 9d0243bca345d5ce25d3f4b74b7facb3a6df1232
Author: Andrew Morton 
Date:   Sun Jan 8 01:00:39 2006 -0800

 [PATCH] drop-pagecache

 Add /proc/sys/vm/drop_caches.  When written to, this will cause the kernel 
to
 discard as much pagecache and/or reclaimable slab objects as it can.  THis
 operation requires root permissions.

 ...

 This is a debugging feature: useful for getting consistent results between
 filesystem benchmarks.  We could possibly put it under a config option, but
 it's less than 300 bytes.

Also see http://lkml.org/lkml/2013/7/26/230




Re: [PATCH 0/5] VFS: Directory level cache cleaning

2013-12-16 Thread Li Wang

If we do want to equip fadvise() with directory level page cache
cleaning, this could be solved by checking (inode_permission() ||
capable(CAP_SYS_ADMIN)) before manipulating the page cache of that
inode.

We think the current extension to 'drop_caches' is fully backward
compatible - the old semantics stay unchanged - and add-on features
for finer-grained cache cleaning should also be desirable.

On 2013/12/17 11:58, Matthew Wilcox wrote:

On Tue, Dec 17, 2013 at 11:08:16AM +0800, Li Wang wrote:

As far as we know, fadvise(DONTNEED) does not support metadata
cache cleaning. We think that is desirable in massive-small-files
situations. Another question is whether people would accept the
behavior that feeding a directory fd to fadvise recursively cleans
the page caches of all files inside that directory.


I think there's a really good permissions-related question here.
If that's an acceptable interface, should one have to be CAP_SYS_ADMIN
to issue the request?  What if some of the files below this directory
are not owned by the user issuing the request?


On 2013/12/17 1:45, Cong Wang wrote:

On Mon, Dec 16, 2013 at 7:00 AM, Li Wang  wrote:

This patch extends the 'drop_caches' interface to
support directory level cache cleaning and is fully backward
compatible. '{1,2,3}' keeps the same semantics as before. In
addition, "{1,2,3}:DIRECTORY_PATH_NAME" recursively cleans the
caches under DIRECTORY_PATH_NAME. For example,
'echo 1:/home/foo/jpg > /proc/sys/vm/drop_caches' will clean the
page caches of the files inside '/home/foo/jpg'.



This interface is ugly...

And we already have a file-level drop cache, that is,
fadvise(DONTNEED). Can you extend it if it can't
handle a directory fd?







Re: [PATCH 0/5] VFS: Directory level cache cleaning

2013-12-16 Thread Li Wang

As far as we know, fadvise(DONTNEED) does not support metadata
cache cleaning. We think that is desirable in massive-small-files
situations. Another question is whether people would accept the
behavior that feeding a directory fd to fadvise recursively cleans
the page caches of all files inside that directory.

On 2013/12/17 1:45, Cong Wang wrote:

On Mon, Dec 16, 2013 at 7:00 AM, Li Wang  wrote:

This patch extends the 'drop_caches' interface to
support directory level cache cleaning and is fully backward
compatible. '{1,2,3}' keeps the same semantics as before. In
addition, "{1,2,3}:DIRECTORY_PATH_NAME" recursively cleans the
caches under DIRECTORY_PATH_NAME. For example,
'echo 1:/home/foo/jpg > /proc/sys/vm/drop_caches' will clean the
page caches of the files inside '/home/foo/jpg'.



This interface is ugly...

And we already have a file-level drop cache, that is,
fadvise(DONTNEED). Can you extend it if it can't
handle a directory fd?




[PATCH 5/5] VFS: Extend drop_caches sysctl handler to allow directory level cache cleaning

2013-12-16 Thread Li Wang

Signed-off-by: Li Wang 
Signed-off-by: Yunchuan Wen 
---
 fs/drop_caches.c |   45 +
 1 file changed, 37 insertions(+), 8 deletions(-)

diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 9fd702f..ab31393 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -8,10 +8,11 @@
 #include 
 #include 
 #include 
+#include 
 #include "internal.h"
 
 /* A global variable is a bit ugly, but it keeps the code simple */
-int sysctl_drop_caches;
+char sysctl_drop_caches[PATH_MAX];
 
 static void drop_pagecache_sb(struct super_block *sb, void *unused)
 {
@@ -54,15 +55,43 @@ int drop_caches_sysctl_handler(ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
 {
int ret;
+   int command;
+   struct path path;
+   struct path root;
 
-   ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
-   if (ret)
-   return ret;
-   if (write) {
-   if (sysctl_drop_caches & 1)
+   ret = proc_dostring(table, write, buffer, length, ppos);
+   if (ret || !write)
+   goto out;
+   ret = -EINVAL;
+   command = sysctl_drop_caches[0] - '0';
+   if (command < 1 || command > 3)
+   goto out;
+   if (sysctl_drop_caches[1] == '\0') {
+   if (command & 1)
iterate_supers(drop_pagecache_sb, NULL);
-   if (sysctl_drop_caches & 2)
+   if (command & 2)
drop_slab();
+   ret = 0;
+   goto out;
}
-   return 0;
+   if (sysctl_drop_caches[1] != ':' || sysctl_drop_caches[2] == '\0')
+   goto out;
+   if (sysctl_drop_caches[2] == '/')
+   get_fs_root(current->fs, &root);
+   else
+   get_fs_pwd(current->fs, &root);
+   ret = vfs_path_lookup(root.dentry, root.mnt,
+   &sysctl_drop_caches[2], 0, &path);
+   path_put(&root);
+   if (ret)
+   goto out;
+   if (command & 1)
+   shrink_pagecache_parent(path.dentry);
+   if (command & 2)
+   shrink_dcache_parent(path.dentry);
+   path_put(&path);
+out:
+   if (ret)
+   memset(sysctl_drop_caches, 0, PATH_MAX);
+   return ret;
 }
-- 
1.7.9.5

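For reference, driving the extended interface from C looks like this (a
sketch against the patched kernel; the path is illustrative and root is
required):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Leading digit as before (1: page cache, 2: dentries, 3: both);
	 * ":PATH" limits the cleaning to that subtree. */
	const char *cmd = "1:/home/foo/jpg";
	int fd = open("/proc/sys/vm/drop_caches", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, cmd, strlen(cmd)) < 0)
		perror("write");
	close(fd);
	return 0;
}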


[PATCH 3/5] VFS: Add the declaration of shrink_pagecache_parent

2013-12-16 Thread Li Wang

Signed-off-by: Li Wang 
Signed-off-by: Yunchuan Wen 
---
 include/linux/dcache.h |1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 57e87e7..ce11098 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -247,6 +247,7 @@ extern struct dentry *d_find_any_alias(struct inode *inode);
 extern struct dentry * d_obtain_alias(struct inode *);
 extern void shrink_dcache_sb(struct super_block *);
 extern void shrink_dcache_parent(struct dentry *);
+extern void shrink_pagecache_parent(struct dentry *);
 extern void shrink_dcache_for_umount(struct super_block *);
 extern int d_invalidate(struct dentry *);
 
-- 
1.7.9.5



[PATCH 1/5] VFS: Convert drop_caches to accept string

2013-12-16 Thread Li Wang

Signed-off-by: Li Wang 
Signed-off-by: Yunchuan Wen 
---
 kernel/sysctl.c |6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 34a6047..2f2d8ab 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1255,12 +1255,10 @@ static struct ctl_table vm_table[] = {
},
{
.procname   = "drop_caches",
-   .data   = &sysctl_drop_caches,
-   .maxlen = sizeof(int),
+   .data   = sysctl_drop_caches,
+   .maxlen = PATH_MAX,
.mode   = 0644,
.proc_handler   = drop_caches_sysctl_handler,
-   .extra1 = &one,
-   .extra2 = &three,
},
 #ifdef CONFIG_COMPACTION
{
-- 
1.7.9.5



[PATCH 2/5] VFS: Convert sysctl_drop_caches to string

2013-12-16 Thread Li Wang

Signed-off-by: Li Wang 
Signed-off-by: Yunchuan Wen 
---
 include/linux/mm.h |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1cedd00..5e3cc5b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct mempolicy;
 struct anon_vma;
@@ -1920,7 +1921,7 @@ int in_gate_area_no_mm(unsigned long addr);
 #endif /* __HAVE_ARCH_GATE_AREA */
 
 #ifdef CONFIG_SYSCTL
-extern int sysctl_drop_caches;
+extern char sysctl_drop_caches[PATH_MAX];
 int drop_caches_sysctl_handler(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
 #endif
-- 
1.7.9.5



[PATCH 4/5] VFS: Add shrink_pagecache_parent

2013-12-16 Thread Li Wang
Analogous to shrink_dcache_parent(), except that it collects inodes.
It does not really belong in dcache.c, but d_walk() can only be
invoked from there.

Signed-off-by: Li Wang 
Signed-off-by: Yunchuan Wen 
---
 fs/dcache.c |   35 +++
 1 file changed, 35 insertions(+)

diff --git a/fs/dcache.c b/fs/dcache.c
index 4bdb300..bcbfd0d 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1318,6 +1318,41 @@ void shrink_dcache_parent(struct dentry *parent)
 }
 EXPORT_SYMBOL(shrink_dcache_parent);
 
+static enum d_walk_ret gather_inode(void *data, struct dentry *dentry)
+{
+   struct list_head *list = data;
+   struct inode *inode = dentry->d_inode;
+
+   if (inode == NULL)
+   goto out;
+   spin_lock(&inode->i_lock);
+   if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+   (inode->i_mapping->nrpages == 0) ||
+   (!list_empty(&inode->i_lru))) {
+   goto out_unlock;
+   }
+   __iget(inode);
+   list_add_tail(&inode->i_lru, list);
+out_unlock:
+   spin_unlock(&inode->i_lock);
+out:
+   return D_WALK_CONTINUE;
+}
+
+void shrink_pagecache_parent(struct dentry *parent)
+{
+   LIST_HEAD(list);
+   struct inode *inode, *next;
+
+   d_walk(parent, &list, gather_inode, NULL);
+   list_for_each_entry_safe(inode, next, &list, i_lru) {
+   list_del_init(&inode->i_lru);
+   invalidate_mapping_pages(inode->i_mapping, 0, -1);
+   iput(inode);
+   }
+}
+EXPORT_SYMBOL(shrink_pagecache_parent);
+
 static enum d_walk_ret umount_collect(void *_data, struct dentry *dentry)
 {
struct select_data *data = _data;
-- 
1.7.9.5



[PATCH 0/5] VFS: Directory level cache cleaning

2013-12-16 Thread Li Wang
Currently, Linux only supports file-system-wide VFS
cache (dentry cache and page cache) cleaning through
'/proc/sys/vm/drop_caches'. Sometimes this is not flexible enough.
Applications may know exactly whether metadata and data will be
referenced again in the future; a desirable mechanism is to let
applications reclaim the memory of unused cache entries at a finer
granularity - the directory level. This lets applications keep hot
metadata and data (to be referenced in the future) in the cache and
kick unused entries out to avoid cache thrashing. Another advantage
is that it is more flexible for debugging.

This patch extends the 'drop_caches' interface to
support directory level cache cleaning and is fully backward
compatible. '{1,2,3}' keeps the same semantics as before. In
addition, "{1,2,3}:DIRECTORY_PATH_NAME" recursively cleans the
caches under DIRECTORY_PATH_NAME. For example,
'echo 1:/home/foo/jpg > /proc/sys/vm/drop_caches' will clean the
page caches of the files inside '/home/foo/jpg'.

It is easy to demonstrate the advantage of directory level
cache cleaning. We use a virtual machine configured with
an Intel(R) Xeon(R) 8-core CPU E5506 @ 2.13GHz, and with 1GB
memory.  Three directories named '1', '2' and '3' are created,
with each containing 180k – 280k files. The test program
opens all files in a directory and then tries the next directory.
The order for accessing the directories is '1', '2', '3',
'1'.

The time to access '1' the second time is measured
with and without cache cleaning, under different file counts.
With cache cleaning, we clean all cache entries of the files
in '2' before accessing the files in '3'. The results
are as follows (in seconds):

(Note: by default, the VFS moves unreferenced inodes onto a global
LRU list rather than freeing them. For this experiment, we modified
iput() to force the inode to be freed as well; this behavior and the
related code are left for further discussion and thus are not
reflected in this patch.)

Number of files:    180k   200k    220k    240k    260k
Without cleaning:  2.165  6.977  10.032  11.571  13.443
With cleaning:     1.949  1.906   2.336   2.918   3.651

When the number of files is 180k in each directory,
the metadata cache is large enough to buffer all entries
of the three directories, so re-accessing '1' hits in
the cache regardless of whether '2' is cleaned up or not.
As the number of files increases, the cache can only
buffer a bit more than two directories. Accessing '3' causes some
entries of '1' to be evicted (due to LRU). When re-accessing '1',
some entries need to be reloaded from disk, which is time-consuming.
In this case, cleaning '2' before accessing '3' enjoys a good
speedup; a maximum 4.29X performance improvement is achieved. The
advantage of directory level page cache cleaning should be even
easier to demonstrate.

Signed-off-by: Li Wang 
Signed-off-by: Yunchuan Wen 

Li Wang (5):
  VFS: Convert drop_caches to accept string
  VFS: Convert sysctl_drop_caches to string
  VFS: Add the declaration of shrink_pagecache_parent
  VFS: Add shrink_pagecache_parent
  VFS: Extend drop_caches sysctl handler to allow directory level cache
cleaning

 fs/dcache.c|   35 +++
 fs/drop_caches.c   |   45 +
 include/linux/dcache.h |1 +
 include/linux/mm.h |3 ++-
 kernel/sysctl.c|6 ++
 5 files changed, 77 insertions(+), 13 deletions(-)

-- 
1.7.9.5

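The test program itself is not part of the posting; a minimal sketch of the
access loop it describes (directory names as in the cover letter):

#include <dirent.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Open every file in 'dir' once, pulling its dentry/inode into the
 * metadata cache - the access pattern the cover letter times. */
static void touch_dir(const char *dir)
{
	char path[4096];
	struct dirent *de;
	DIR *d = opendir(dir);

	if (!d)
		return;
	while ((de = readdir(d)) != NULL) {
		int fd;

		if (de->d_name[0] == '.')
			continue;
		snprintf(path, sizeof(path), "%s/%s", dir, de->d_name);
		fd = open(path, O_RDONLY);
		if (fd >= 0)
			close(fd);
	}
	closedir(d);
}

int main(void)
{
	touch_dir("1");
	touch_dir("2");	/* optionally cleaned via drop_caches afterwards */
	touch_dir("3");
	touch_dir("1");	/* the timed pass */
	return 0;
}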


[PATCH 1/2] ceph: Clean up if error occurred in finish_read()

2013-11-27 Thread Li Wang
Clean up if an error occurred rather than going through the normal process.

Signed-off-by: Li Wang 
Signed-off-by: Yunchuan Wen 
---
 fs/ceph/addr.c |3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 1e561c0..97845b4 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -252,6 +252,8 @@ static void finish_read(struct ceph_osd_request *req, 
struct ceph_msg *msg)
for (i = 0; i < num_pages; i++) {
struct page *page = osd_data->pages[i];
 
+   if (rc < 0)
+   goto unlock;
if (bytes < (int)PAGE_CACHE_SIZE) {
/* zero (remainder of) page */
int s = bytes < 0 ? 0 : bytes;
@@ -262,6 +264,7 @@ static void finish_read(struct ceph_osd_request *req, 
struct ceph_msg *msg)
flush_dcache_page(page);
SetPageUptodate(page);
ceph_readpage_to_fscache(inode, page);
+unlock:
unlock_page(page);
page_cache_release(page);
bytes -= PAGE_CACHE_SIZE;
-- 
1.7.9.5



[PATCH 0/2] ceph: Add clean up if invalid osd reply received

2013-11-27 Thread Li Wang
Signed-off-by: Li Wang 
Signed-off-by: Yunchuan Wen 

Li Wang (2):
  ceph: Clean up if error occurred in finish_read()
  ceph: Add necessary clean up if invalid reply received in
handle_reply()

 fs/ceph/addr.c|3 +++
 net/ceph/osd_client.c |7 +++
 2 files changed, 10 insertions(+)

-- 
1.7.9.5



[PATCH 2/2] ceph: Add necessary clean up if invalid reply received in handle_reply()

2013-11-27 Thread Li Wang
Wake up possible waiters, invoke the callback if any, and unregister the request.

Signed-off-by: Li Wang 
Signed-off-by: Yunchuan Wen 
---
 net/ceph/osd_client.c |7 +++
 1 file changed, 7 insertions(+)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 2b4b32a..a17eaae 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1581,6 +1581,13 @@ done:
return;
 
 bad_put:
+   req->r_result = -EIO;
+   __unregister_request(osdc, req);
+   if (req->r_callback)
+   req->r_callback(req, msg);
+   else
+   complete_all(&req->r_completion);
+   complete_request(req);
ceph_osdc_put_request(req);
 bad_mutex:
mutex_unlock(&osdc->request_mutex);
-- 
1.7.9.5

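Why the completion matters (a sketch of a typical synchronous caller;
illustrative, not quoted from the patch):

/*
 * A synchronous submitter does roughly:
 *
 *   ceph_osdc_start_request(osdc, req, false);
 *   ceph_osdc_wait_request(osdc, req);   // blocks on req->r_completion
 *
 * If handle_reply() bailed out on a malformed reply without completing
 * r_completion (or running r_callback), that caller would sleep
 * forever and the request would stay registered. The added error path
 * reports -EIO and releases both.
 */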


Re: [PATCH] Ceph: Avoid data inconsistency due to d-cache aliasing in readpage()

2013-11-13 Thread Li Wang

Hi Yan,
  zero_user_segment() already invokes flush_dcache_page() for us; we do
not want to flush the d-cache twice.


Cheers,
Li Wang

On 11/13/2013 09:19 PM, Yan, Zheng wrote:

On Wed, Nov 13, 2013 at 3:22 PM, Li Wang  wrote:

If the length of data to be read in readpage() is exactly
PAGE_CACHE_SIZE, the original code does not flush the d-cache
for data consistency after the read finishes. This patch fixes
this.

Signed-off-by: Li Wang 
---
  fs/ceph/addr.c |8 ++--
  1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 6df8bd4..7ba 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -210,9 +210,13 @@ static int readpage_nounlock(struct file *filp, struct 
page *page)
 if (err < 0) {
 SetPageError(page);
 goto out;
-   } else if (err < PAGE_CACHE_SIZE) {
+   } else {
+   if (err < PAGE_CACHE_SIZE) {
 /* zero fill remainder of page */
-   zero_user_segment(page, err, PAGE_CACHE_SIZE);
+   zero_user_segment(page, err, PAGE_CACHE_SIZE);
+   } else {
+   flush_dcache_page(page);
+   }


this doesn't make sense for me. why not call flush_dcache_page unconditionally?

Regards
Yan, Zheng

 }
 SetPageUptodate(page);

--
1.7.9.5




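For context, the flush that zero_user_segment() performs can be seen in its
shape from include/linux/highmem.h of that era (slightly simplified sketch):

static inline void zero_user_segment(struct page *page,
				     unsigned start, unsigned end)
{
	void *kaddr = kmap_atomic(page);

	memset(kaddr + start, 0, end - start);
	kunmap_atomic(kaddr);
	flush_dcache_page(page);	/* the flush Li refers to */
}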


[PATCH] Ceph: Avoid data inconsistency due to d-cache aliasing in readpage()

2013-11-12 Thread Li Wang
If the length of data to be read in readpage() is exactly
PAGE_CACHE_SIZE, the original code does not flush the d-cache
for data consistency after the read finishes. This patch fixes
this.

Signed-off-by: Li Wang 
---
 fs/ceph/addr.c |8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 6df8bd4..7ba 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -210,9 +210,13 @@ static int readpage_nounlock(struct file *filp, struct 
page *page)
if (err < 0) {
SetPageError(page);
goto out;
-   } else if (err < PAGE_CACHE_SIZE) {
+   } else {
+   if (err < PAGE_CACHE_SIZE) {
/* zero fill remainder of page */
-   zero_user_segment(page, err, PAGE_CACHE_SIZE);
+   zero_user_segment(page, err, PAGE_CACHE_SIZE);
+   } else {
+   flush_dcache_page(page);
+   }
}
SetPageUptodate(page);
 
-- 
1.7.9.5



[PATCH 7/7] Cifs: Uncaching no-data page in readpage()

2013-11-11 Thread Li Wang
Currently, if a page is allocated into fscache in readpage() but no
data is read into it, it is not uncached. This patch fixes this.

Signed-off-by: Li Wang 
---
 fs/cifs/file.c |4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 7f2..153bc58 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3406,8 +3406,10 @@ static int cifs_readpage_worker(struct file *file, 
struct page *page,
 
rc = cifs_read(file, read_data, PAGE_CACHE_SIZE, poffset);
 
-   if (rc < 0)
+   if (rc < 0) {
+   cifs_fscache_readpage_cancel(file_inode(file), page);
goto io_error;
+   }
else
cifs_dbg(FYI, "Bytes read %d\n", rc);
 
-- 
1.7.9.5



[PATCH 4/7] Ceph: Uncaching no-data page in readpage()

2013-11-11 Thread Li Wang
Currently, if a page is allocated into fscache in readpage() but no
data is read into it, it is not uncached. This patch fixes this.

Signed-off-by: Li Wang 
---
 fs/ceph/addr.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 6df8bd4..be5f4b6 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -209,6 +209,7 @@ static int readpage_nounlock(struct file *filp, struct page 
*page)
err = 0;
if (err < 0) {
SetPageError(page);
+   ceph_fscache_readpage_cancel(inode, page);
goto out;
} else if (err < PAGE_CACHE_SIZE) {
/* zero fill remainder of page */
-- 
1.7.9.5



[PATCH 3/7] Ceph: Introduce routine for uncaching single no-data page

2013-11-11 Thread Li Wang
Introduce a routine for uncaching a single no-data page, typically
in readpage().

Signed-off-by: Li Wang 
---
 fs/ceph/cache.h |   13 +
 1 file changed, 13 insertions(+)

diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index ba94940..eb0ec76 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -67,6 +67,14 @@ static inline int ceph_release_fscache_page(struct page 
*page, gfp_t gfp)
return fscache_maybe_release_page(ci->fscache, page, gfp);
 }
 
+
+static inline void ceph_fscache_readpage_cancel(struct inode *inode,
+   struct page *page)
+{
+   struct ceph_inode_info *ci = ceph_inode(inode);
+   return fscache_readpage_cancel(ci->fscache, page);
+}
+
 static inline void ceph_fscache_readpages_cancel(struct inode *inode,
 struct list_head *pages)
 {
@@ -145,6 +153,11 @@ static inline int ceph_release_fscache_page(struct page 
*page, gfp_t gfp)
return 1;
 }
 
+static inline void ceph_fscache_readpage_cancel(struct inode *inode,
+   struct page *page)
+{
+}
+
 static inline void ceph_fscache_readpages_cancel(struct inode *inode,
 struct list_head *pages)
 {
-- 
1.7.9.5



[PATCH 6/7] Cifs: Implement uncaching single no-data page

2013-11-11 Thread Li Wang
Implement the routine for uncaching a single no-data page, typically
in readpage().

Signed-off-by: Li Wang 
---
 fs/cifs/fscache.c |7 +++
 1 file changed, 7 insertions(+)

diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index 8d4b7bc..168f184 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -223,6 +223,13 @@ void __cifs_readpage_to_fscache(struct inode *inode, 
struct page *page)
fscache_uncache_page(CIFS_I(inode)->fscache, page);
 }
 
+void __cifs_fscache_readpage_cancel(struct inode *inode, struct page *page)
+{
+cifs_dbg(FYI, "%s: (fsc: %p, i: %p)\n",
+ __func__, CIFS_I(inode)->fscache, inode);
+fscache_readpage_cancel(CIFS_I(inode)->fscache, page);
+}
+
 void __cifs_fscache_readpages_cancel(struct inode *inode, struct list_head 
*pages)
 {
cifs_dbg(FYI, "%s: (fsc: %p, i: %p)\n",
-- 
1.7.9.5



[PATCH 1/7] Fscache: Introduce new API fscache_readpage_cancel()

2013-11-11 Thread Li Wang
Introduce a new API, fscache_readpage_cancel(), for uncaching a single
no-data page from fscache.

Signed-off-by: Li Wang 
---
 include/linux/fscache.h |   11 +++
 1 file changed, 11 insertions(+)

diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 115bb81..f1ed21f 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -245,6 +245,8 @@ extern bool __fscache_maybe_release_page(struct 
fscache_cookie *, struct page *,
 gfp_t);
 extern void __fscache_uncache_all_inode_pages(struct fscache_cookie *,
  struct inode *);
+extern void __fscache_readpage_cancel(struct fscache_cookie *cookie,
+ struct page *page);
 extern void __fscache_readpages_cancel(struct fscache_cookie *cookie,
   struct list_head *pages);
 extern void __fscache_disable_cookie(struct fscache_cookie *, bool);
@@ -633,6 +635,15 @@ int fscache_alloc_page(struct fscache_cookie *cookie,
return -ENOBUFS;
 }
 
+static inline
+void fscache_readpage_cancel(struct fscache_cookie *cookie,
+struct page *page)
+{
+   if (fscache_cookie_valid(cookie))
+   __fscache_readpage_cancel(cookie, page);
+}
+
+
 /**
  * fscache_readpages_cancel - Cancel read/alloc on pages
  * @cookie: The cookie representing the inode's cache object.
-- 
1.7.9.5



[PATCH 2/7] Fscache: Implement uncaching single no-data page

2013-11-11 Thread Li Wang
Similar to the routine for multiple pages, except that it takes a
page * as input rather than a list_head *.

Signed-off-by: Li Wang 
---
 fs/fscache/page.c |8 
 1 file changed, 8 insertions(+)

diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 7f5c658..0c69f72 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -721,6 +721,14 @@ nobufs:
 }
 EXPORT_SYMBOL(__fscache_alloc_page);
 
+void __fscache_readpage_cancel(struct fscache_cookie *cookie,
+  struct page *page)
+{
+   if (PageFsCache(page))
+   __fscache_uncache_page(cookie, page);
+}
+EXPORT_SYMBOL(__fscache_readpage_cancel);
+
 /*
  * Unmark pages allocate in the readahead code path (via:
  * fscache_readpages_or_alloc) after delegating to the base filesystem
-- 
1.7.9.5



[PATCH 0/7] Cifs and Ceph: Uncache single no-data page in readpage()

2013-11-11 Thread Li Wang
Currently, a page allocated into fscache in readpage()
for Cifs and Ceph is not uncached if no data is read due
to an I/O error. This patch set fixes this. fscache_readpages_cancel()
does this kind of job but takes a list_head * as input, so
a new routine taking a page * as input is introduced.

Li Wang (7):
  Fscache: Introduce new API fscache_readpage_cancel()
  Fscache: Implement uncaching single no-data page
  Ceph: Introduce routine for uncaching single no-data page
  Ceph: Uncaching no-data page in readpage()
  Cifs: Introduce routine for uncaching single no-data page
  Cifs: Implement uncaching single no-data page
  Cifs: Uncaching no-data page in readpage()

 fs/ceph/addr.c  |1 +
 fs/ceph/cache.h |   13 +
 fs/cifs/file.c  |4 +++-
 fs/cifs/fscache.c   |7 +++
 fs/cifs/fscache.h   |   13 +
 fs/fscache/page.c   |8 
 include/linux/fscache.h |   11 +++
 7 files changed, 56 insertions(+), 1 deletion(-)

-- 
1.7.9.5



[PATCH 5/7] Cifs: Introduce routine for uncaching single no-data page

2013-11-11 Thread Li Wang
Introduce a routine for uncaching a single no-data page, typically
in readpage().

Signed-off-by: Li Wang 
---
 fs/cifs/fscache.h |   13 +
 1 file changed, 13 insertions(+)

diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h
index 24794b6..c712f42 100644
--- a/fs/cifs/fscache.h
+++ b/fs/cifs/fscache.h
@@ -54,6 +54,7 @@ extern int __cifs_readpages_from_fscache(struct inode *,
 struct address_space *,
 struct list_head *,
 unsigned *);
+extern void __cifs_fscache_readpage_cancel(struct inode *, struct page *);
 extern void __cifs_fscache_readpages_cancel(struct inode *, struct list_head 
*);
 
 extern void __cifs_readpage_to_fscache(struct inode *, struct page *);
@@ -92,6 +93,13 @@ static inline void cifs_readpage_to_fscache(struct inode 
*inode,
__cifs_readpage_to_fscache(inode, page);
 }
 
+static inline void cifs_fscache_readpage_cancel(struct inode *inode,
+   struct page *page)
+{
+   if (CIFS_I(inode)->fscache)
+   return __cifs_fscache_readpage_cancel(inode, page);
+}
+
 static inline void cifs_fscache_readpages_cancel(struct inode *inode,
 struct list_head *pages)
 {
@@ -139,6 +147,11 @@ static inline int cifs_readpages_from_fscache(struct inode 
*inode,
 static inline void cifs_readpage_to_fscache(struct inode *inode,
struct page *page) {}
 
+static inline void cifs_fscache_readpage_cancel(struct inode *inode,
+   struct page *page)
+{
+}
+
 static inline void cifs_fscache_readpages_cancel(struct inode *inode,
 struct list_head *pages)
 {
-- 
1.7.9.5



[PATCH] Ceph: allocate non-zero page to fscache in readpage()

2013-11-08 Thread Li Wang
ceph_osdc_readpages() returns the number of bytes read; currently,
the code only puts a page into fscache when zero bytes were read
(i.e. a fully zero-filled page). This patch fixes this.

Signed-off-by: Li Wang 
---
 fs/ceph/addr.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 6df8bd4..1e561c0 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -216,7 +216,7 @@ static int readpage_nounlock(struct file *filp, struct page 
*page)
}
SetPageUptodate(page);
 
-   if (err == 0)
+   if (err >= 0)
ceph_readpage_to_fscache(inode, page);
 
 out:
-- 
1.7.9.5



Re: [RFC PATCH] ceph: Write through cache support based on fscache

2013-11-02 Thread Li Wang

Hi Milosz,
  Thanks for your comments.
  We think an SSD- and fscache-based write cache is definitely useful
for Ceph, since write amplification slows down Ceph's write
performance to some extent. Lustre has already introduced an SSD-based
write cache. The SSD can be treated as a big outer cache for the page
cache, reducing the demand on network and OSD bandwidth. A write-back
cache is better for performance, but more complicated to implement in
order to meet the consistency and other correctness demands of Ceph
and POSIX, such as sync(). A write-through cache is much simpler and
does not raise those issues. So our goal is to implement both; we plan
to submit a blueprint at the upcoming CDS.
  It would be great if you could help review and comment on our code
during development. Again, thanks very much.


Cheers,
Li Wang

On 11/02/2013 12:51 AM, Milosz Tanski wrote:

Li,

I think it would be fantastic to see a write cache. In many workloads
you end up writing out a file and then turning around and reading it
right back in on the same node.

There are a few things that I would like to see. First, a mount option
to turn write-through caching on/off. There are some workload/user
hardware configurations that will not benefit from this (it might be a
net negative). Also, I think it's nice to have a fallback to disable
it if it's misbehaving.

Second, for correctness I think you should only do write-through
caching if you have an exclusive cap on the file. Currently as the
code is written it only reads from fscache if the file is open in read
only mode and has the cache cap. This would also have to change.

Thanks,
- Milosz

P.S: Sorry for the second message Li, I fail at email and forgot to reply-all.

On Fri, Nov 1, 2013 at 9:49 AM, Li Wang  wrote:

Currently, fscache only acts as a read cache for Ceph; this patch
enables it to act as a write-through cache as well.

A small trick to be discussed: if the write to the OSD finishes before
the write to fscache, the fscache write is cancelled to avoid slowing
down the writepages() process.

Signed-off-by: Min Chen 
Signed-off-by: Li Wang 
Signed-off-by: Yunchuan Wen 
---
  fs/ceph/addr.c  |   10 +++---
  fs/ceph/cache.c |   29 +
  fs/ceph/cache.h |   13 +
  3 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 6df8bd4..2465c49 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -506,7 +506,7 @@ static int writepage_nounlock(struct page *page, struct 
writeback_control *wbc)
 CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
 set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);

-   ceph_readpage_to_fscache(inode, page);
+   ceph_writepage_to_fscache(inode, page);

 set_page_writeback(page);
 err = ceph_osdc_writepages(osdc, ceph_vino(inode),
@@ -634,6 +634,7 @@ static void writepages_finish(struct ceph_osd_request *req,
 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
 generic_error_remove_page(inode->i_mapping, page);

+   ceph_maybe_release_fscache_page(inode, page);
 unlock_page(page);
 }
 dout("%p wrote+cleaned %d pages\n", inode, wrote);
@@ -746,7 +747,7 @@ retry:

 while (!done && index <= end) {
 int num_ops = do_sync ? 2 : 1;
-   unsigned i;
+   unsigned i, j;
 int first;
 pgoff_t next;
 int pvec_pages, locked_pages;
@@ -894,7 +895,6 @@ get_more_pages:
 if (!locked_pages)
 goto release_pvec_pages;
 if (i) {
-   int j;
 BUG_ON(!locked_pages || first < 0);

 if (pvec_pages && i == pvec_pages &&
@@ -924,6 +924,10 @@ get_more_pages:

 osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
 !!pool, false);
+   for (j = 0; j < locked_pages; j++) {
+   struct page *page = pages[j];
+   ceph_writepage_to_fscache(inode, page);
+   }

 pages = NULL;   /* request message now owns the pages array */
 pool = NULL;
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 6bfe65e..6f928c4 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -320,6 +320,24 @@ void ceph_readpage_to_fscache(struct inode *inode, struct 
page *page)
  fscache_uncache_page(ci->fscache, page);
  }

+void ceph_writepage_to_fscache(struct inode *inode, struct page *page)
+{
+   struct ceph_inode_info *ci = ceph_inode(inode);
+   int ret;
+
+   if (!cache_valid(ci))
+   return;
+
+   if (!PageFsCa

[RFC PATCH] ceph: Write through cache support based on fscache

2013-11-01 Thread Li Wang
Currently, fscache only acts as a read cache for Ceph; this patch
enables it to act as a write-through cache as well.

A small trick to be discussed: if the write to the OSD finishes before
the write to fscache, the fscache write is cancelled to avoid slowing
down the writepages() process.

Signed-off-by: Min Chen 
Signed-off-by: Li Wang 
Signed-off-by: Yunchuan Wen 
---
 fs/ceph/addr.c  |   10 +++---
 fs/ceph/cache.c |   29 +
 fs/ceph/cache.h |   13 +
 3 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 6df8bd4..2465c49 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -506,7 +506,7 @@ static int writepage_nounlock(struct page *page, struct 
writeback_control *wbc)
CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
 
-   ceph_readpage_to_fscache(inode, page);
+   ceph_writepage_to_fscache(inode, page);
 
set_page_writeback(page);
err = ceph_osdc_writepages(osdc, ceph_vino(inode),
@@ -634,6 +634,7 @@ static void writepages_finish(struct ceph_osd_request *req,
if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
generic_error_remove_page(inode->i_mapping, page);
 
+   ceph_maybe_release_fscache_page(inode, page);
unlock_page(page);
}
dout("%p wrote+cleaned %d pages\n", inode, wrote);
@@ -746,7 +747,7 @@ retry:
 
while (!done && index <= end) {
int num_ops = do_sync ? 2 : 1;
-   unsigned i;
+   unsigned i, j;
int first;
pgoff_t next;
int pvec_pages, locked_pages;
@@ -894,7 +895,6 @@ get_more_pages:
if (!locked_pages)
goto release_pvec_pages;
if (i) {
-   int j;
BUG_ON(!locked_pages || first < 0);
 
if (pvec_pages && i == pvec_pages &&
@@ -924,6 +924,10 @@ get_more_pages:
 
osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
!!pool, false);
+   for (j = 0; j < locked_pages; j++) {
+   struct page *page = pages[j];
+   ceph_writepage_to_fscache(inode, page);
+   }
 
pages = NULL;   /* request message now owns the pages array */
pool = NULL;
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 6bfe65e..6f928c4 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -320,6 +320,24 @@ void ceph_readpage_to_fscache(struct inode *inode, struct 
page *page)
 fscache_uncache_page(ci->fscache, page);
 }
 
+void ceph_writepage_to_fscache(struct inode *inode, struct page *page)
+{
+   struct ceph_inode_info *ci = ceph_inode(inode);
+   int ret;
+
+   if (!cache_valid(ci))
+   return;
+
+   if (!PageFsCache(page)) {
+   if (fscache_alloc_page(ci->fscache, page, GFP_KERNEL))
+   return;
+   }
+
+   if (fscache_write_page(ci->fscache, page, GFP_KERNEL))
+   fscache_uncache_page(ci->fscache, page);
+}
+
+
 void ceph_invalidate_fscache_page(struct inode* inode, struct page *page)
 {
struct ceph_inode_info *ci = ceph_inode(inode);
@@ -328,6 +346,17 @@ void ceph_invalidate_fscache_page(struct inode* inode, struct page *page)
fscache_uncache_page(ci->fscache, page);
 }
 
+void ceph_maybe_release_fscache_page(struct inode *inode, struct page *page)
+{
+   struct ceph_inode_info *ci = ceph_inode(inode);
+
+   if (PageFsCache(page)) {
+   if (!fscache_check_page_write(ci->fscache, page))
+   fscache_maybe_release_page(ci->fscache,
+  page, GFP_KERNEL);
+   }
+}
+
 void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
 {
if (fsc->revalidate_wq)
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index ba94940..aa02b7a 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -45,7 +45,9 @@ int ceph_readpages_from_fscache(struct inode *inode,
struct list_head *pages,
unsigned *nr_pages);
 void ceph_readpage_to_fscache(struct inode *inode, struct page *page);
+void ceph_writepage_to_fscache(struct inode *inode, struct page *page);
 void ceph_invalidate_fscache_page(struct inode* inode, struct page *page);
+void ceph_maybe_release_fscache_page(struct inode *inode, struct page *page);
 void ceph_queue_revalidate(struct inode *inode);
 
 static inline void ceph_fscache_invalidate(struct inode *inode)
@@ -127,6 +129,11 @@ static inline void ceph_readpage_to_fscache(struct i

[PATCH] ceph: Update the pages in fscache in writepages() path

2013-10-31 Thread Li Wang
Currently, the pages in fscache are only updated in the writepage() path;
this patch adds the same processing to the writepages() path.

Signed-off-by: Min Chen 
Signed-off-by: Li Wang 
Signed-off-by: Yunchuan Wen 
---
 fs/ceph/addr.c |8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 6df8bd4..cc57911 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -746,7 +746,7 @@ retry:
 
while (!done && index <= end) {
int num_ops = do_sync ? 2 : 1;
-   unsigned i;
+   unsigned i, j;
int first;
pgoff_t next;
int pvec_pages, locked_pages;
@@ -894,7 +894,6 @@ get_more_pages:
if (!locked_pages)
goto release_pvec_pages;
if (i) {
-   int j;
BUG_ON(!locked_pages || first < 0);
 
if (pvec_pages && i == pvec_pages &&
@@ -924,7 +923,10 @@ get_more_pages:
 
osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
!!pool, false);
-
+   for (j = 0; j < locked_pages; j++) {
+   struct page *page = pages[j];
+   ceph_readpage_to_fscache(inode, page);
+   }
pages = NULL;   /* request message now owns the pages array */
pool = NULL;
 
-- 
1.7.9.5



[PATCH v5] Ceph: Punch hole support for kernel client

2013-08-14 Thread Li Wang
This patch implements fallocate and punch hole support for the Ceph kernel client.

Signed-off-by: Li Wang 
Signed-off-by: Yunchuan Wen 
---
Against v3:

Passed the fsx test from xfstests.

Truncate rather than delete the first object. Thanks go to Sage and Zheng for 
the explanation.

Silence the OSD ENOENT complaints.
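
As a worked example of the layout arithmetic in ceph_zero_objects() below
(illustrative values, not taken from the patch):

	/* hypothetical layout: 1 MB stripe unit, 4 stripes, 4 MB objects */
	__s32 stripe_unit  = 1 << 20;
	__s32 stripe_count = 4;
	__s32 object_size  = 4 << 20;
	loff_t object_set_size = (loff_t)object_size * stripe_count; /* 16 MB */
	/* a hole spanning a whole 16 MB object set is punched with one
	 * CEPH_OSD_OP_DELETE (or TRUNCATE, for the first object) per stripe;
	 * the ragged head and tail fall back to CEPH_OSD_OP_ZERO */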
---
 fs/ceph/file.c|  196 +
 net/ceph/osd_client.c |   11 ++-
 2 files changed, 205 insertions(+), 2 deletions(-)

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 2ddf061..e2bcd5c 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -8,6 +8,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "super.h"
 #include "mds_client.h"
@@ -871,6 +872,200 @@ out:
return offset;
 }
 
+static inline void ceph_zero_partial_page(
+   struct inode *inode, loff_t offset, unsigned size)
+{
+   struct page *page;
+   pgoff_t index = offset >> PAGE_CACHE_SHIFT;
+
+   page = find_lock_page(inode->i_mapping, index);
+   if (page) {
+   wait_on_page_writeback(page);
+   zero_user(page, offset & (PAGE_CACHE_SIZE - 1), size);
+   unlock_page(page);
+   page_cache_release(page);
+   }
+}
+
+static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset,
+ loff_t length)
+{
+   loff_t nearly = round_up(offset, PAGE_CACHE_SIZE);
+   if (offset < nearly) {
+   loff_t size = nearly - offset;
+   if (length < size)
+   size = length;
+   ceph_zero_partial_page(inode, offset, size);
+   offset += size;
+   length -= size;
+   }
+   if (length >= PAGE_CACHE_SIZE) {
+   loff_t size = round_down(length, PAGE_CACHE_SIZE);
+   truncate_pagecache_range(inode, offset, offset + size - 1);
+   offset += size;
+   length -= size;
+   }
+   if (length)
+   ceph_zero_partial_page(inode, offset, length);
+}
+
+static int ceph_zero_partial_object(struct inode *inode,
+   loff_t offset, loff_t *length)
+{
+   struct ceph_inode_info *ci = ceph_inode(inode);
+   struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+   struct ceph_osd_request *req;
+   int ret = 0;
+   loff_t zero = 0;
+   int op;
+
+   if (!length) {
+   op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE;
+   length = &zero;
+   } else {
+   op = CEPH_OSD_OP_ZERO;
+   }
+
+   req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+   ceph_vino(inode),
+   offset, length,
+   1, op,
+   CEPH_OSD_FLAG_WRITE |
+   CEPH_OSD_FLAG_ONDISK,
+   NULL, 0, 0, false);
+   if (IS_ERR(req)) {
+   ret = PTR_ERR(req);
+   goto out;
+   }
+
+   ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap,
+   &inode->i_mtime);
+
+   ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+   if (!ret) {
+   ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+   if (ret == -ENOENT)
+   ret = 0;
+   }
+   ceph_osdc_put_request(req);
+
+out:
+   return ret;
+}
+
+static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length)
+{
+   int ret = 0;
+   struct ceph_inode_info *ci = ceph_inode(inode);
+   __s32 stripe_unit = ceph_file_layout_su(ci->i_layout);
+   __s32 stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
+   __s32 object_size = ceph_file_layout_object_size(ci->i_layout);
+   loff_t object_set_size = (loff_t)object_size * stripe_count;
+
+   loff_t nearly = (offset + object_set_size - 1)
+   / object_set_size * object_set_size;
+   while (length && offset < nearly) {
+   loff_t size = length;
+   ret = ceph_zero_partial_object(inode, offset, &size);
+   if (ret < 0)
+   return ret;
+   offset += size;
+   length -= size;
+   }
+   while (length >= object_set_size) {
+   int i;
+   loff_t pos = offset;
+   for (i = 0; i < stripe_count; ++i) {
+   ret = ceph_zero_partial_object(inode, pos, NULL);
+   if (ret < 0)
+   return ret;
+   pos += stripe_unit;
+   }
+   offset += object_set_size;
+   length -= object_set_size;

[PATCH v4] Ceph: Punch hole support for kernel client

2013-08-14 Thread Li Wang
This patch implements fallocate and punch hole support for the Ceph kernel client.

Signed-off-by: Li Wang 
Signed-off-by: Yunchuan Wen 
---
Passed the fsx test from xfstests.

Truncate rather than delete the first object. Thanks go to Sage and Zheng for 
the explanation.
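
A worked example (illustrative numbers, assuming PAGE_CACHE_SIZE = 4096)
of the page-boundary handling in ceph_zero_pagecache_range() below:

	/* punch at offset = 3000, length = 10000:
	 *   nearly = round_up(3000, 4096) = 4096 -> zero partial [3000, 4096)
	 *   round_down(8904, 4096) = 8192        -> truncate [4096, 12288)
	 *   remaining 712 bytes                  -> zero partial [12288, 13000)
	 */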
---
 fs/ceph/file.c|  193 +
 net/ceph/osd_client.c |   11 ++-
 2 files changed, 202 insertions(+), 2 deletions(-)

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 2ddf061..04201fb 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -8,6 +8,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "super.h"
 #include "mds_client.h"
@@ -871,6 +872,197 @@ out:
return offset;
 }
 
+static inline void ceph_zero_partial_page(
+   struct inode *inode, loff_t offset, unsigned size)
+{
+   struct page *page;
+   pgoff_t index = offset >> PAGE_CACHE_SHIFT;
+
+   page = find_lock_page(inode->i_mapping, index);
+   if (page) {
+   wait_on_page_writeback(page);
+   zero_user(page, offset & (PAGE_CACHE_SIZE - 1), size);
+   unlock_page(page);
+   page_cache_release(page);
+   }
+}
+
+static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset,
+ loff_t length)
+{
+   loff_t nearly = round_up(offset, PAGE_CACHE_SIZE);
+   if (offset < nearly) {
+   loff_t size = nearly - offset;
+   if (length < size)
+   size = length;
+   ceph_zero_partial_page(inode, offset, size);
+   offset += size;
+   length -= size;
+   }
+   if (length >= PAGE_CACHE_SIZE) {
+   loff_t size = round_down(length, PAGE_CACHE_SIZE);
+   truncate_pagecache_range(inode, offset, offset + size - 1);
+   offset += size;
+   length -= size;
+   }
+   if (length)
+   ceph_zero_partial_page(inode, offset, length);
+}
+
+static int ceph_zero_partial_object(struct inode *inode,
+   loff_t offset, loff_t *length)
+{
+   struct ceph_inode_info *ci = ceph_inode(inode);
+   struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+   struct ceph_osd_request *req;
+   int ret = 0;
+   loff_t zero = 0;
+   int op;
+
+   if (!length) {
+   op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE;
+   length = &zero;
+   } else {
+   op = CEPH_OSD_OP_ZERO;
+   }
+
+   req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+   ceph_vino(inode),
+   offset, length,
+   1, op,
+   CEPH_OSD_FLAG_WRITE |
+   CEPH_OSD_FLAG_ONDISK,
+   NULL, 0, 0, false);
+   if (IS_ERR(req)) {
+   ret = PTR_ERR(req);
+   goto out;
+   }
+
+   ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap,
+   &inode->i_mtime);
+
+   ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+   if (!ret)
+   ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+   ceph_osdc_put_request(req);
+
+out:
+   return ret;
+}
+
+static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length)
+{
+   int ret = 0;
+   struct ceph_inode_info *ci = ceph_inode(inode);
+   __s32 stripe_unit = ceph_file_layout_su(ci->i_layout);
+   __s32 stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
+   __s32 object_size = ceph_file_layout_object_size(ci->i_layout);
+   loff_t object_set_size = (loff_t)object_size * stripe_count;
+
+   loff_t nearly = (offset + object_set_size - 1)
+   / object_set_size * object_set_size;
+   while (length && offset < nearly) {
+   loff_t size = length;
+   ret = ceph_zero_partial_object(inode, offset, &size);
+   if (ret < 0)
+   return ret;
+   offset += size;
+   length -= size;
+   }
+   while (length >= object_set_size) {
+   int i;
+   loff_t pos = offset;
+   for (i = 0; i < stripe_count; ++i) {
+   ret = ceph_zero_partial_object(inode, pos, NULL);
+   if (ret < 0)
+   return ret;
+   pos += stripe_unit;
+   }
+   offset += object_set_size;
+   length -= object_set_size;
+   }
+   while (length) {
+   loff_t size = length;
+   ret = ceph_zero_partial

[PATCH] x86: remove redundant local_irq_enable() after cpuidle_idle_call()

2013-08-13 Thread Li Wang
When cpuidle_idle_call() returns 0, it means the system is using a
cpuidle framework driver, and local irqs have already been enabled
inside cpuidle_idle_call(). So there is no need to enable local irqs
again on the 0-return path.

The redundant enable was introduced by commit:
97a5b81fa4d3a11dcdf224befc577f2e0abadc0b ("x86: Fix idle consolidation fallout")
The defect fixed there did not involve a cpuidle framework driver; it
just called amd_e400_idle(), and the problem was that amd_e400_idle()
did not enable irqs.
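
For reference, the resulting idle path (a condensed view of the code in
the diff below):

	void arch_cpu_idle(void)
	{
		if (cpuidle_idle_call())
			x86_idle();
		/* a 0 return means a cpuidle driver handled idle and already
		 * enabled local irqs inside cpuidle_idle_call() */
	}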

Signed-off-by: Li Wang 
---
 arch/x86/kernel/process.c |2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 83369e5..cb55ee4 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -300,8 +300,6 @@ void arch_cpu_idle(void)
 {
if (cpuidle_idle_call())
x86_idle();
-   else
-   local_irq_enable();
 }
 
 /*
-- 
1.7.9.5



[PATCH v3] Ceph: Punch hole support for kernel client

2013-07-22 Thread Li Wang
This patch implements fallocate and punch hole support for the Ceph kernel client.

Signed-off-by: Li Wang 
Signed-off-by: Yunchuan Wen 
---
Passed the fsx test from xfstests.
---
 fs/ceph/file.c|  191 +
 net/ceph/osd_client.c |8 ++-
 2 files changed, 197 insertions(+), 2 deletions(-)

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 656e169..6e56824 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -8,6 +8,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "super.h"
 #include "mds_client.h"
@@ -882,6 +883,195 @@ out:
return offset;
 }
 
+static inline void ceph_zero_partial_page(
+   struct inode *inode, loff_t offset, unsigned size)
+{
+   struct page *page;
+   pgoff_t index = offset >> PAGE_CACHE_SHIFT;
+
+   page = find_lock_page(inode->i_mapping, index);
+   if (page) {
+   wait_on_page_writeback(page);
+   zero_user(page, offset & (PAGE_CACHE_SIZE - 1), size);
+   unlock_page(page);
+   page_cache_release(page);
+   }
+}
+
+static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset,
+ loff_t length)
+{
+   loff_t nearly = round_up(offset, PAGE_CACHE_SIZE);
+   if (offset < nearly) {
+   loff_t size = nearly - offset;
+   if (length < size)
+   size = length;
+   ceph_zero_partial_page(inode, offset, size);
+   offset += size;
+   length -= size;
+   }
+   if (length >= PAGE_CACHE_SIZE) {
+   loff_t size = round_down(length, PAGE_CACHE_SIZE);
+   truncate_pagecache_range(inode, offset, offset + size - 1);
+   offset += size;
+   length -= size;
+   }
+   if (length)
+   ceph_zero_partial_page(inode, offset, length);
+}
+
+static int ceph_zero_partial_object(struct inode *inode,
+   loff_t offset, loff_t *length)
+{
+   struct ceph_inode_info *ci = ceph_inode(inode);
+   struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+   struct ceph_osd_request *req;
+   int ret = 0;
+   loff_t zero = 0;
+   int op = CEPH_OSD_OP_ZERO;
+
+   if (!length) {
+   op = CEPH_OSD_OP_DELETE;
+   length = &zero;
+   }
+
+   req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+   ceph_vino(inode),
+   offset, length,
+   1, op,
+   CEPH_OSD_FLAG_WRITE |
+   CEPH_OSD_FLAG_ONDISK,
+   NULL, 0, 0, false);
+   if (IS_ERR(req)) {
+   ret = PTR_ERR(req);
+   goto out;
+   }
+
+   ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap,
+   &inode->i_mtime);
+
+   ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+   if (!ret)
+   ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+   ceph_osdc_put_request(req);
+
+out:
+   return ret;
+}
+
+static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length)
+{
+   int ret = 0;
+   struct ceph_inode_info *ci = ceph_inode(inode);
+   __s32 stripe_unit = ceph_file_layout_su(ci->i_layout);
+   __s32 stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
+   __s32 object_size = ceph_file_layout_object_size(ci->i_layout);
+   loff_t object_set_size = (loff_t)object_size * stripe_count;
+
+   loff_t nearly = (offset + object_set_size - 1)
+   / object_set_size * object_set_size;
+   while (length && offset < nearly) {
+   loff_t size = length;
+   ret = ceph_zero_partial_object(inode, offset, &size);
+   if (ret < 0)
+   return ret;
+   offset += size;
+   length -= size;
+   }
+   while (length >= object_set_size) {
+   int i;
+   loff_t pos = offset;
+   for (i = 0; i < stripe_count; ++i) {
+   ret = ceph_zero_partial_object(inode, pos, NULL);
+   if (ret < 0)
+   return ret;
+   pos += stripe_unit;
+   }
+   offset += object_set_size;
+   length -= object_set_size;
+   }
+   while (length) {
+   loff_t size = length;
+   ret = ceph_zero_partial_object(inode, offset, &size);
+   if (ret < 0)
+   return ret;
+   offset += size;
+   length -= size;
+   }
+ 

[RFC] Ceph: Kernel client part of inline data support

2013-07-08 Thread Li Wang
This patch implements the kernel client part of inline data support;
the algorithm is described below.

This is a preliminary implementation based on Linux kernel 3.8.3.

States:
CEPH_INLINE_MIGRATION: the file size has exceeded the inline threshold, but the MDS still has the newest inline data
CEPH_INLINE_DISABLED: the file is not inlined, and the MDS does not have the inline data
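
A minimal sketch of the client-side progression (hypothetical enum; the
hunks below actually encode these as ordered values of
ci->i_inline_data.version):

	enum ceph_inline_state {
		CEPH_INLINE_INLINED,    /* hypothetical: data fully inline at the MDS */
		CEPH_INLINE_MIGRATION,  /* threshold exceeded, MDS copy still newest */
		CEPH_INLINE_DISABLED,   /* not inlined, MDS has no inline data */
	};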

Client:
On open, lookup, getattr, handle_cap_grant, etc.,
  the MDS sends the inline data together with the inode metadata to the client

Read side:

if (hold CEPH_CAP_FILE_CACHE capability) // ceph_readpage()/ceph_readpages()
  if (state < CEPH_INLINE_MIGRATION)
copy inline data from inode buffer into page cache 
  else 
if (state == CEPH_INLINE_MIGRATION)
  read the data from the OSD
  replace the head of the first page with the inline data from inode buffer
else // ceph_sync_read()
  if (state != CEPH_INLINE_DISABLED)
send GETATTR message to MDS to fetch inline data into inode buffer
copy the inline data from inode buffer to user buffer directly
if (state == CEPH_INLINE_MIGRATION and pos+len>CEPH_INLINE_SIZE)
  continue to read the remaining data from OSD to user buffer

Write side:

if (hold CEPH_CAP_FILE_CACHE capability) 
  if (state < CEPH_INLINE_MIGRATION) // ceph_write_end()
if (pos < CEPH_INLINE_SIZE)
  if (pos + len > CEPH_INLINE_SIZE)
let state = CEPH_INLINE_DISABLED
else
  let state = CEPH_INLINE_MIGRATION
  else if (state == CEPH_INLINE_MIGRATION)
if (pos < CEPH_INLINE_SIZE)
  let state = CEPH_INLINE_DISABLED;
 
  if (state < CEPH_INLINE_MIGRATION) // ceph_writepage/ceph_writepages_start()
copy data from page cache into inode buffer
mark cap and inode dirty to send inode buffer to MDS
  else
do the normal write to OSD
else // ceph_sync_write()
  if (state != CEPH_INLINE_DISABLED) 
if (pos < CEPH_INLINE_SIZE)
  copy the written data that fits into [pos, min(pos+len, CEPH_INLINE_SIZE)) from
the user buffer directly to the inode buffer
  let dirty_data_only=true, record the write pos as well as length // leave 
MDS to merge
  mark cap and inode dirty to send (maybe part of) written data to MDS
if (pos + len >= CEPH_INLINE_SIZE)
  let state = CEPH_INLINE_MIGRATION
  write the remaining data to OSD
  else
do the normal write to OSD

Signed-off-by: Li Wang 
Signed-off-by: Yunchuan Wen 
---
 fs/ceph/addr.c   |  186 ++
 fs/ceph/caps.c   |   61 --
 fs/ceph/file.c   |   90 +++-
 fs/ceph/inode.c  |   19 -
 fs/ceph/mds_client.c |   14 ++--
 fs/ceph/mds_client.h |2 +
 fs/ceph/super.h  |   14 
 include/linux/ceph/ceph_fs.h |4 +
 net/ceph/messenger.c |2 +-
 9 files changed, 342 insertions(+), 50 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 064d1a6..033396c 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -204,6 +204,18 @@ static int readpage_nounlock(struct file *filp, struct page *page)
 
	dout("readpage inode %p file %p page %p index %lu\n",
	 inode, filp, page, page->index);
+
+   if (ci->i_inline_data.version < CEPH_INLINE_MIGRATION && ci->i_inline_data.length) {
+   void *virt = kmap(page);
+   memcpy(virt, ci->i_inline_data.data, ci->i_inline_data.length);
+   kunmap(page);
+   zero_user_segment(page, ci->i_inline_data.length, PAGE_CACHE_SIZE);
+   flush_dcache_page(page);
+   SetPageUptodate(page);
+   err = 0;
+   goto out;
+   }
+   
err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
  (u64) page_offset(page), &len,
  ci->i_truncate_seq, ci->i_truncate_size,
@@ -217,6 +229,13 @@ static int readpage_nounlock(struct file *filp, struct page *page)
	/* zero fill remainder of page */
	zero_user_segment(page, err, PAGE_CACHE_SIZE);
	}
+
+   if (ci->i_inline_data.version == CEPH_INLINE_MIGRATION && ci->i_inline_data.length) {
+   void *virt = kmap(page);
+   memcpy(virt, ci->i_inline_data.data, ci->i_inline_data.length);
+   kunmap(page);
+   flush_dcache_page(page);
+   }
SetPageUptodate(page);
 
 out:
@@ -252,6 +271,15 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
	for (i = 0; i < req->r_num_pages; i++, bytes -= PAGE_CACHE_SIZE) {
	struct page *page = req->r_pages[i];
 
+   struct ceph_inode_info *ci = ceph_inode(inode);
+   if (ci->i_inline_data.version == CEPH_INLINE_MIGRATION && page->index == 0) {
+   if (ci->i_inline_data.lengt

[PATCH v2] Ceph: Punch hole support

2013-06-19 Thread Li Wang

This patch implements punch hole (fallocate) support for Ceph.

Signed-off-by: Li Wang 
Signed-off-by: Yunchuan Wen 
---
 fs/ceph/file.c|  313 +
 net/ceph/osd_client.c |8 +-
 2 files changed, 319 insertions(+), 2 deletions(-)

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 656e169..578e5fd 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -8,6 +8,7 @@
 #include 
 #include 
 #include 
+#include 

 #include "super.h"
 #include "mds_client.h"
@@ -882,6 +883,317 @@ out:
return offset;
 }

+static inline void ceph_zero_partial_page(struct inode *inode, pgoff_t index, unsigned start, unsigned size)
+{
+   struct page *page;
+
+   page = find_lock_page(inode->i_mapping, index);
+   if (page) {
+   zero_user(page, start, size);
+   unlock_page(page);
+   page_cache_release(page);
+   }   
+}
+
+static void ceph_truncate_and_zero_page_cache(struct inode *inode, loff_t offset, loff_t length)
+{
+   loff_t first_page;
+   loff_t last_page;
+   loff_t zero_len;
+
+	first_page = ((offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;
+   last_page = ((offset + length) >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;
+   if (last_page > first_page) {
+   truncate_pagecache_range(inode, first_page, last_page - 1);
+   }
+   if (first_page > last_page) {
+		ceph_zero_partial_page(inode, offset >> PAGE_CACHE_SHIFT, offset & (PAGE_CACHE_SIZE - 1), length);
+   return;
+   }
+   /*
+* zero out the partial page that contains
+* the start of the hole
+*/ 
+   zero_len  = first_page - offset;
+   if (zero_len > 0) {
+		ceph_zero_partial_page(inode, offset >> PAGE_CACHE_SHIFT, offset & (PAGE_CACHE_SIZE -1), zero_len);
+   }
+   /*
+* zero out the partial page that contains
+* the end of the hole
+*/
+   zero_len = offset + length - last_page;
+   if (zero_len > 0) {
+		ceph_zero_partial_page(inode, (offset + length) >> PAGE_CACHE_SHIFT, 0, zero_len);
+   }
+   /*
+* If i_size is contained in the last page, we need to
+* zero the partial page after i_size
+*/
+	if (inode->i_size >> PAGE_CACHE_SHIFT == (offset + length) >> PAGE_CACHE_SHIFT && inode->i_size % PAGE_CACHE_SIZE != 0) {
+   zero_len = PAGE_CACHE_SIZE -
+   (inode->i_size & (PAGE_CACHE_SIZE - 1));
+   if (zero_len > 0) {
+			ceph_zero_partial_page(inode, inode->i_size >> PAGE_CACHE_SHIFT, inode->i_size & (PAGE_CACHE_SIZE -1), zero_len);
+   }
+   }
+}
+
+static inline __u32 ceph_calculate_shift(__s64 size)
+{
+   int shift;
+   
+   if (size <= 0)
+   return -1;
+   if (size == 1)
+   return 0;
+   for (shift = 0; ;shift++) {
+   if (2 << shift == size)
+   break;
+   }
+   shift++;
+   
+   return shift;
+}
+
+static int ceph_delete_object(struct inode *inode, u64 offset, u64 *length)
+{
+   struct ceph_inode_info *ci = ceph_inode(inode);
+struct ceph_fs_client *fsc = ceph_inode_to_client(inode);  
+   struct ceph_osd_request *req;
+   int ret = 0;
+   
+   req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+ceph_vino(inode), offset, length, 1,
+CEPH_OSD_OP_DELETE, CEPH_OSD_FLAG_ONDISK,
+NULL,
+ci->i_truncate_seq, ci->i_truncate_size,
+false);
+   if (IS_ERR(req)) {
+   ret = PTR_ERR(req);
+   goto out;
+   }
+
+ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+if (!ret) {
+ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+}
+   ceph_osdc_put_request(req);
+
+   out:
+   return ret;
+}
+
+static int ceph_zero_partial_object(struct inode *inode, loff_t offset, loff_t *length)
+{
+   struct ceph_inode_info *ci = ceph_inode(inode);
+   struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+   struct ceph_osd_request *req;
+   int ret = 0;
+   
+   if (*length <= 0)
+   goto out;
+
+   
+   req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+ceph_vino(inode), offset, length, 1,
+CEPH_OSD_OP_ZERO, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+NULL,
+ci->i_truncate_seq, ci->i_truncate_size,
+   

[PATCH 1/2] Punch hole support against 3.8-rc3

2013-06-14 Thread Li Wang
This patch implements punch hole (fallocate) support against
Linux kernel 3.8-rc3.

Signed-off-by: Li Wang 
Signed-off-by: Yunchuan Wen 
---
 fs/ceph/file.c|  248 +
 net/ceph/osd_client.c |   17 +++-
 2 files changed, 260 insertions(+), 5 deletions(-)

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index e51558f..7fb9c6d 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -7,6 +7,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "super.h"
 #include "mds_client.h"
@@ -848,6 +849,252 @@ out:
return offset;
 }
 
+static inline void ceph_zero_partial_page(struct inode *inode, pgoff_t index, unsigned start, unsigned size)
+{
+   struct page *page;
+
+   page = find_lock_page(inode->i_mapping, index);
+   if (page) {
+   zero_user(page, start, size);
+   unlock_page(page);
+   page_cache_release(page);
+   }   
+}
+
+static void ceph_truncate_and_zero_page_cache(struct inode *inode, loff_t offset, loff_t length)
+{
+   loff_t first_page;
+   loff_t last_page;
+   loff_t zero_len;
+
+   first_page = ((offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;
+   last_page = ((offset + length) >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;
+   if (last_page > first_page) {
+   truncate_pagecache_range(inode, first_page, last_page - 1);
+   }
+   if (first_page > last_page) {
+   ceph_zero_partial_page(inode, offset >> PAGE_CACHE_SHIFT, offset & (PAGE_CACHE_SIZE - 1), length);
+   return;
+   }
+   /*
+* zero out the partial page that contains
+* the start of the hole
+*/ 
+   zero_len  = first_page - offset;
+   if (zero_len > 0) {
+   ceph_zero_partial_page(inode, offset >> PAGE_CACHE_SHIFT, offset & (PAGE_CACHE_SIZE -1), zero_len);
+   }
+   /*
+* zero out the partial page that contains
+* the end of the hole
+*/
+   zero_len = offset + length - last_page;
+   if (zero_len > 0) {
+   ceph_zero_partial_page(inode, (offset + length) >> PAGE_CACHE_SHIFT, 0, zero_len);
+   }
+   /*
+* If i_size is contained in the last page, we need to
+* zero the partial page after i_size
+*/
+   if (inode->i_size >> PAGE_CACHE_SHIFT == (offset + length) >> PAGE_CACHE_SHIFT && inode->i_size % PAGE_CACHE_SIZE != 0) {
+   zero_len = PAGE_CACHE_SIZE -
+   (inode->i_size & (PAGE_CACHE_SIZE - 1));
+   if (zero_len > 0) {
+   ceph_zero_partial_page(inode, inode->i_size >> PAGE_CACHE_SHIFT, inode->i_size & (PAGE_CACHE_SIZE -1), zero_len);
+   }
+   }
+}
+
+static int ceph_delete_object_range(struct inode *inode, loff_t lstart, loff_t lend)
+{
+   struct ceph_inode_info *ci = ceph_inode(inode);
+struct ceph_fs_client *fsc = ceph_inode_to_client(inode);  
+   struct ceph_osd_request *req;
+   u64 length = ceph_file_layout_object_size(ci->i_layout);
+   loff_t offset;
+   int ret = 0;
+
+   if (lstart > lend || length <= 0)
+   goto out;
+   for (offset = lstart; offset <= lend; offset += length) {   
+   req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+ceph_vino(inode), offset, &length,
+CEPH_OSD_OP_DELETE, CEPH_OSD_FLAG_ONDISK,
+NULL,
+0,
+ci->i_truncate_seq, ci->i_truncate_size,
+NULL, false, 1, 0);
+   if (IS_ERR(req)) {
+   ret = PTR_ERR(req);
+   goto out;
+   }
+
+   ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+   if (!ret) {
+   ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+   }
+   ceph_osdc_put_request(req); 
+   /* object deleted */
+   if (ret == -ENOENT)
+   ret = 0;
+   }
+
+   out:
+   return ret;
+}
+
+static int ceph_zero_partial_object(struct file *file, loff_t offset, loff_t length)
+{
+   struct ceph_file_info *fi = file->private_data; 
+   struct inode *inode = file->f_dentry->d_inode;
+   struct ceph_inode_info *ci = ceph_inode(inode);
+   struct ceph_fs_client *fsc = ceph_inode_to_client(inode);   
+   struct ceph_osd_request *req;
+   struct timespec mtime = CURRENT_TIME;
+   int want, got = 0, ret = 0;
+   
+   if (length <= 0)
+   goto out;
+
+   
+  

[PATCH 0/2] Kernel file system client support for punch hole

2013-06-14 Thread Li Wang
This patch implements punch hole (fallocate) support for the Ceph kernel
file system client.
We prepared two patches based on different kernel versions, one against
kernel 3.8-rc3, the other against the latest 3.10-rc5. This is because,
unfortunately, we failed to set up a workable Ceph system with the client
based on the latest code from the Linux kernel git tree; for the server
side, we tried both the latest code from the Ceph git tree and
the latest v0.61.3 release. The client would easily hang without
any response unless the machine was rebooted.
We managed to set up a Ceph system with the client based on Linux
kernel 3.8-rc3 and the server based on Ceph v0.61.3, so the patch
against v3.8-rc3 has undergone preliminary testing; the one
against v3.10-rc5 has not.
Comments are appreciated.



[PATCH 2/2] Punch hole support against 3.10-rc5

2013-06-14 Thread Li Wang
This patch implements punch hole (fallocate) support against
Linux kernel 3.10-rc5.

Signed-off-by: Li Wang 
Signed-off-by: Yunchuan Wen 
---
 fs/ceph/file.c|  245 +
 net/ceph/osd_client.c |8 +-
 2 files changed, 251 insertions(+), 2 deletions(-)

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 656e169..e092b69 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -8,6 +8,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "super.h"
 #include "mds_client.h"
@@ -882,6 +883,249 @@ out:
return offset;
 }
 
+static inline void ceph_zero_partial_page(struct inode *inode, pgoff_t index, unsigned start, unsigned size)
+{
+   struct page *page;
+
+   page = find_lock_page(inode->i_mapping, index);
+   if (page) {
+   zero_user(page, start, size);
+   unlock_page(page);
+   page_cache_release(page);
+   }   
+}
+
+static void ceph_truncate_and_zero_page_cache(struct inode *inode, loff_t offset, loff_t length)
+{
+   loff_t first_page;
+   loff_t last_page;
+   loff_t zero_len;
+
+   first_page = ((offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;
+   last_page = ((offset + length) >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;
+   if (last_page > first_page) {
+   truncate_pagecache_range(inode, first_page, last_page - 1);
+   }
+   if (first_page > last_page) {
+   ceph_zero_partial_page(inode, offset >> PAGE_CACHE_SHIFT, offset & (PAGE_CACHE_SIZE - 1), length);
+   return;
+   }
+   /*
+* zero out the partial page that contains
+* the start of the hole
+*/ 
+   zero_len  = first_page - offset;
+   if (zero_len > 0) {
+   ceph_zero_partial_page(inode, offset >> PAGE_CACHE_SHIFT, offset & (PAGE_CACHE_SIZE -1), zero_len);
+   }
+   /*
+* zero out the partial page that contains
+* the end of the hole
+*/
+   zero_len = offset + length - last_page;
+   if (zero_len > 0) {
+   ceph_zero_partial_page(inode, (offset + length) >> PAGE_CACHE_SHIFT, 0, zero_len);
+   }
+   /*
+* If i_size is contained in the last page, we need to
+* zero the partial page after i_size
+*/
+   if (inode->i_size >> PAGE_CACHE_SHIFT == (offset + length) >> PAGE_CACHE_SHIFT && inode->i_size % PAGE_CACHE_SIZE != 0) {
+   zero_len = PAGE_CACHE_SIZE -
+   (inode->i_size & (PAGE_CACHE_SIZE - 1));
+   if (zero_len > 0) {
+   ceph_zero_partial_page(inode, inode->i_size >> PAGE_CACHE_SHIFT, inode->i_size & (PAGE_CACHE_SIZE -1), zero_len);
+   }
+   }
+}
+
+static int ceph_delete_object_range(struct inode *inode, loff_t lstart, loff_t lend)
+{
+   struct ceph_inode_info *ci = ceph_inode(inode);
+struct ceph_fs_client *fsc = ceph_inode_to_client(inode);  
+   struct ceph_osd_request *req;
+   u64 length = ceph_file_layout_object_size(ci->i_layout);
+   loff_t offset;
+   int ret = 0;
+
+   if (lstart > lend || length <= 0)
+   goto out;
+   for (offset = lstart; offset <= lend; offset += length) {   
+   req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+ceph_vino(inode), offset, &length,
+1, CEPH_OSD_OP_DELETE, CEPH_OSD_FLAG_ONDISK,
+NULL,
+ci->i_truncate_seq, ci->i_truncate_size,
+false);
+   if (IS_ERR(req)) {
+   ret = PTR_ERR(req);
+   goto out;
+   }
+
+   ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+   if (!ret) {
+   ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+   }
+   ceph_osdc_put_request(req); 
+   /* object deleted */
+   if (ret == -ENOENT)
+   ret = 0;
+   }
+
+   out:
+   return ret;
+}
+
+static int ceph_zero_partial_object(struct file *file, loff_t offset, loff_t length)
+{
+   struct ceph_file_info *fi = file->private_data; 
+   struct inode *inode = file->f_dentry->d_inode;
+   struct ceph_inode_info *ci = ceph_inode(inode);
+   struct ceph_fs_client *fsc = ceph_inode_to_client(inode);   
+   struct ceph_osd_request *req;
+   int want, got = 0, ret = 0;
+   
+   if (length <= 0)
+   goto out;
+
+   
+   if (fi->fmode & CEPH_FILE_MODE_LAZY)
+   want = CEPH_CAP_F

[PATCH v4] ext4: Avoid unnecessarily writing back dirty pages before hole punching

2013-05-27 Thread Li Wang
For hole punching, ext4 currently writes back synchronously the dirty
pages that fall into the hole. Since the data on disk corresponding
to those pages are about to be deleted, it is beneficial to release those
pages directly, whether they are dirty or not, except in the ordered case.

Signed-off-by: Li Wang 
Signed-off-by: Yunchuan Wen 
Cc: Dmitry Monakhov 
Reviewed-by: Zheng Liu 
Reviewed-by: Jan Kara 
---
Hi Jan,
  Did you mean this?
  It seems you do not like jbd2_journal_begin_ordered_discard() :),
but what do you think of calling jbd2_journal_begin_ordered_punch_hole()
from jbd2_journal_begin_ordered_truncate()? In my opinion,
the two functions stand at the same level. Nevertheless,
it is up to your choice.
---
 fs/ext4/inode.c   |   27 ---
 fs/jbd2/journal.c |2 +-
 fs/jbd2/transaction.c |   29 ++---
 include/linux/jbd2.h  |   33 +++--
 4 files changed, 54 insertions(+), 37 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d6382b8..844d1b8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3569,6 +3569,16 @@ int ext4_can_truncate(struct inode *inode)
return 0;
 }
 
+static inline int ext4_begin_ordered_punch_hole(struct inode *inode,
+  loff_t start, loff_t length)
+{
+   if (!EXT4_I(inode)->jinode)
+   return 0;
+   return jbd2_journal_begin_ordered_punch_hole(EXT4_JOURNAL(inode),
+   EXT4_I(inode)->jinode,
+   start, start+length-1);
+}
+
 /*
  * ext4_punch_hole: punches a hole in a file by releaseing the blocks
  * associated with the given offset and length
@@ -3602,17 +3612,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
 
trace_ext4_punch_hole(inode, offset, length);
 
-   /*
-* Write out all dirty pages to avoid race conditions
-* Then release them.
-*/
-   if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
-   ret = filemap_write_and_wait_range(mapping, offset,
-  offset + length - 1);
-   if (ret)
-   return ret;
-   }
-
mutex_lock(&inode->i_mutex);
/* It's not possible punch hole on append only file */
if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
@@ -3644,6 +3643,12 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
first_page_offset = first_page << PAGE_CACHE_SHIFT;
last_page_offset = last_page << PAGE_CACHE_SHIFT;
 
+   if (ext4_should_order_data(inode)) {
+   ret = ext4_begin_ordered_punch_hole(inode, offset, length);
+   if (ret)
+   return ret;
+   }
+
/* Now release the pages */
if (last_page_offset > first_page_offset) {
truncate_pagecache_range(inode, first_page_offset,
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 9545757..7af4e4f 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -97,7 +97,7 @@ EXPORT_SYMBOL(jbd2_journal_force_commit);
 EXPORT_SYMBOL(jbd2_journal_file_inode);
 EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
 EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
-EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
+EXPORT_SYMBOL(jbd2_journal_begin_ordered_punch_hole);
 EXPORT_SYMBOL(jbd2_inode_cache);
 
 static void __journal_abort_soft (journal_t *journal, int errno);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 10f524c..262b1c3 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -2305,29 +2305,10 @@ done:
return 0;
 }
 
-/*
- * File truncate and transaction commit interact with each other in a
- * non-trivial way.  If a transaction writing data block A is
- * committing, we cannot discard the data by truncate until we have
- * written them.  Otherwise if we crashed after the transaction with
- * write has committed but before the transaction with truncate has
- * committed, we could see stale data in block A.  This function is a
- * helper to solve this problem.  It starts writeout of the truncated
- * part in case it is in the committing transaction.
- *
- * Filesystem code must call this function when inode is journaled in
- * ordered mode before truncation happens and after the inode has been
- * placed on orphan list with the new inode size. The second condition
- * avoids the race that someone writes new data and we start
- * committing the transaction after this function has been called but
- * before a transaction for truncate is started (and furthermore it
- * allows us to optimize the case where the addition to orphan list
- * happens in the same transaction as write --- we don't have to write
- * any data in such case).
- */
-int jbd2_journal_

[PATCH] ext4: Avoid unnecessarily writing back dirty pages before hole punching

2013-05-27 Thread Li Wang
For hole punching, ext4 currently writes back synchronously the dirty
pages that fall into the hole. Since the data on disk corresponding
to those pages are about to be deleted, it is beneficial to release those
pages directly, whether they are dirty or not, except in the ordered case.

Signed-off-by: Li Wang 
Signed-off-by: Yunchuan Wen 
Cc: Dmitry Monakhov 
Cc: Jan Kara 
---
Hi Zheng and Jan,
  Thanks for your comments.
  For data=ordered vs. data=writeback, my understanding is
that both journal metadata, so metadata won't be corrupted
in either case. And neither journals data, so data
may be lost in either case. So the overwrite situation is basically
the same: data may not be fully updated.
The difference is that for an appending write, with data=writeback,
the metadata commit is done asynchronously
with the data write, so it may happen
that the file size is increased while the data are incompletely written,
leaving partly uninitialized data, which, as
pointed out by Jan, results in security issues. With
data=ordered, metadata is committed after the data are
written, at slightly(?) lower performance, so readers won't read out
uninitialized data.
  We introduce the internal function jbd2_journal_begin_ordered_discard()
because it will be called by both jbd2_journal_begin_ordered_punch_hole()
and jbd2_journal_begin_ordered_truncate(),
and we want to leave the prototype of jbd2_journal_begin_ordered_
truncate() unchanged; it also has fewer arguments than the punch hole
counterpart. The other way is to implement them independently, without the
internal begin_ordered_discard() function, but in that case
the two functions would share a
large and almost identical body, which is not elegant.
  We have taken the other suggestions from Jan.
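
A sketch of the wrapper arrangement argued for above (signatures assumed
from the hunks below, not a verbatim copy of the patch):

	/* internal helper shared by both discard-style operations */
	static int jbd2_journal_begin_ordered_discard(journal_t *journal,
			struct jbd2_inode *jinode, loff_t start, loff_t end);

	/* punch hole and truncate become thin wrappers over the same body,
	 * keeping the existing truncate prototype unchanged */
	static inline int jbd2_journal_begin_ordered_punch_hole(journal_t *j,
			struct jbd2_inode *ji, loff_t start, loff_t end)
	{
		return jbd2_journal_begin_ordered_discard(j, ji, start, end);
	}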
---
 fs/ext4/inode.c   |   27 ---
 fs/jbd2/journal.c |2 +-
 fs/jbd2/transaction.c |   29 ++---
 include/linux/jbd2.h  |   41 +++--
 4 files changed, 62 insertions(+), 37 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d6382b8..6b0251e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3569,6 +3569,16 @@ int ext4_can_truncate(struct inode *inode)
return 0;
 }
 
+static inline int ext4_begin_ordered_punch_hole(struct inode *inode,
+  loff_t start, loff_t length)
+{
+   if (!EXT4_I(inode)->jinode)
+   return 0;
+   return jbd2_journal_begin_ordered_punch_hole(EXT4_JOURNAL(inode),
+   EXT4_I(inode)->jinode,
+   start, length);
+}
+
 /*
  * ext4_punch_hole: punches a hole in a file by releaseing the blocks
  * associated with the given offset and length
@@ -3602,17 +3612,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
 
trace_ext4_punch_hole(inode, offset, length);
 
-   /*
-* Write out all dirty pages to avoid race conditions
-* Then release them.
-*/
-   if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
-   ret = filemap_write_and_wait_range(mapping, offset,
-  offset + length - 1);
-   if (ret)
-   return ret;
-   }
-
mutex_lock(&inode->i_mutex);
/* It's not possible punch hole on append only file */
if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
@@ -3644,6 +3643,12 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
first_page_offset = first_page << PAGE_CACHE_SHIFT;
last_page_offset = last_page << PAGE_CACHE_SHIFT;
 
+   if (ext4_should_order_data(inode)) {
+   ret = ext4_begin_ordered_punch_hole(inode, offset, length);
+   if (ret)
+   return ret;
+   }
+
/* Now release the pages */
if (last_page_offset > first_page_offset) {
truncate_pagecache_range(inode, first_page_offset,
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 9545757..166ca5d 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -97,7 +97,7 @@ EXPORT_SYMBOL(jbd2_journal_force_commit);
 EXPORT_SYMBOL(jbd2_journal_file_inode);
 EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
 EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
-EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
+EXPORT_SYMBOL(jbd2_journal_begin_ordered_discard);
 EXPORT_SYMBOL(jbd2_inode_cache);
 
 static void __journal_abort_soft (journal_t *journal, int errno);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 10f524c..2d7a3bf 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -2305,29 +2305,10 @@ done:
return 0;
 }
 
-/*
- * File truncate and transaction commit interact with each other in a
- * non-trivial way.  If a 

[PATCH v2] ext4: Avoid unnecessarily writing back dirty pages before hole punching

2013-05-20 Thread Li Wang
For hole punching, ext4 currently writes back synchronously the dirty
pages that fall into the hole. Since the data on disk corresponding
to those pages are about to be deleted, it is beneficial to release those
pages directly, whether they are dirty or not, except in the ordered case.

Signed-off-by: Li Wang 
Signed-off-by: Yunchuan Wen 
Reviewed-by: Zheng Liu 
Cc: Dmitry Monakhov 
---
Hi Zheng,
  Thanks for your comments.
  This is the revised version, with the writeback operation moved
down to after the inode mutex is held. But one thing I want to confirm
is whether the inode mutex can prevent the mmap() writer. I did
not take a careful look at the mmap() code; the straightforward thinking
is that an mmap() write will directly dirty the pages without going through
the VFS generic_file_write() path.
  BTW, I have one other question regarding the ext4 journal modes:
what is the advantage of the data=ordered journal mode compared to data=writeback?
For an overwriting write, it may still lead to inconsistency between data and
metadata, that is, new data with old metadata. So is its selling point
that it beats data=writeback for appending writes?
---
 fs/ext4/inode.c   |   27 +-
 fs/jbd2/journal.c |1 +
 fs/jbd2/transaction.c |   61 +++--
 include/linux/jbd2.h  |3 +++
 4 files changed, 59 insertions(+), 33 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d6382b8..568b0bd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3569,6 +3569,16 @@ int ext4_can_truncate(struct inode *inode)
return 0;
 }
 
+static inline int ext4_begin_ordered_fallocate(struct inode *inode,
+  loff_t start, loff_t length)
+{
+   if (!EXT4_I(inode)->jinode)
+   return 0;
+   return jbd2_journal_begin_ordered_fallocate(EXT4_JOURNAL(inode),
+   EXT4_I(inode)->jinode,
+   start, length);
+}
+
 /*
  * ext4_punch_hole: punches a hole in a file by releaseing the blocks
  * associated with the given offset and length
@@ -3602,17 +3612,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
 
trace_ext4_punch_hole(inode, offset, length);
 
-   /*
-* Write out all dirty pages to avoid race conditions
-* Then release them.
-*/
-   if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
-   ret = filemap_write_and_wait_range(mapping, offset,
-  offset + length - 1);
-   if (ret)
-   return ret;
-   }
-
mutex_lock(&inode->i_mutex);
/* It's not possible punch hole on append only file */
if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
@@ -3644,6 +3643,12 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
first_page_offset = first_page << PAGE_CACHE_SHIFT;
last_page_offset = last_page << PAGE_CACHE_SHIFT;
 
+   if (ext4_should_order_data(inode)) {
+   ret = ext4_begin_ordered_fallocate(inode, offset, length);
+   if (ret)
+   return ret;
+   }
+
/* Now release the pages */
if (last_page_offset > first_page_offset) {
truncate_pagecache_range(inode, first_page_offset,
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 9545757..ccc483a 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -98,6 +98,7 @@ EXPORT_SYMBOL(jbd2_journal_file_inode);
 EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
 EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
 EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
+EXPORT_SYMBOL(jbd2_journal_begin_ordered_fallocate);
 EXPORT_SYMBOL(jbd2_inode_cache);
 
 static void __journal_abort_soft (journal_t *journal, int errno);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 10f524c..035c064 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -2305,6 +2305,36 @@ done:
return 0;
 }
 
+
+static int jbd2_journal_begin_ordered_discard(journal_t *journal,
+   struct jbd2_inode *jinode,
+   loff_t start, loff_t end)
+{
+   transaction_t *inode_trans, *commit_trans;
+   int ret = 0;
+
+   /* This is a quick check to avoid locking if not necessary */
+   if (!jinode->i_transaction)
+   goto out;
+   /* Locks are here just to force reading of recent values, it is
+* enough that the transaction was not committing before we started
+* a transaction adding the inode to orphan list */
+   read_lock(&journal->j_state_lock);
+   commit_trans = journal->j_committing_transaction;
+   read_unlock(&journal->j_state_lock);
+ 

[PATCH] eCryptfs: Avoid unnecessary disk read and data decryption during writing

2012-10-29 Thread Li Wang
ecryptfs_write_begin grabs a page from the page cache for writing.
If the page contains invalid data, or data older than the
counterpart on disk, eCryptfs will read the
corresponding data from the disk into the page and decrypt it,
then perform the write. However, if the length
of the data to be written into this page is equal to the page size,
the whole page of data will be overwritten,
in which case it does not matter what the data were before;
it is beneficial to write directly rather than bothering
to read and decrypt first.

With this optimization, according to our test on a machine with an
Intel Core 2 Duo processor, an iozone 'write' operation on an existing
file with a write size that is a multiple of the page size enjoys a steady
3x speedup.
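
A condensed view of the decision this patch adds in ecryptfs_write_begin()
(simplified from the hunk below):

	if (len == PAGE_CACHE_SIZE) {
		/* whole page will be overwritten: skip the read/decrypt; the
		 * page is only marked uptodate later, in ecryptfs_write_end(),
		 * once a full page has actually been copied in */
	} else {
		/* partial write: existing data must be read and decrypted */
		rc = ecryptfs_decrypt_page(page);
	}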

Signed-off-by: Li Wang 
Signed-off-by: Yunchuan Wen 
Reviewed-by: Tyler Hicks 
---
 fs/ecryptfs/mmap.c |   12 ++--
 1 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index bd1d57f..564a1fa 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -338,7 +338,8 @@ static int ecryptfs_write_begin(struct file *file,
if (prev_page_end_size
>= i_size_read(page->mapping->host)) {
zero_user(page, 0, PAGE_CACHE_SIZE);
-   } else {
+   SetPageUptodate(page);
+   } else if (len < PAGE_CACHE_SIZE) {
rc = ecryptfs_decrypt_page(page);
if (rc) {
printk(KERN_ERR "%s: Error decrypting "
@@ -348,8 +349,8 @@ static int ecryptfs_write_begin(struct file *file,
ClearPageUptodate(page);
goto out;
}
+   SetPageUptodate(page);
}
-   SetPageUptodate(page);
}
}
/* If creating a page or more of holes, zero them out via truncate.
@@ -499,6 +500,13 @@ static int ecryptfs_write_end(struct file *file,
}
goto out;
}
+   if (!PageUptodate(page)) {
+   if (copied < PAGE_CACHE_SIZE) {
+   rc = 0;
+   goto out;
+   }
+   SetPageUptodate(page);
+   }
/* Fills in zeros if 'to' goes beyond inode size */
rc = fill_zeros_to_end_of_page(page, to);
if (rc) {
-- 
1.7.6.5



[RFC] VFS: File System Mount Wide O_DIRECT Support

2012-09-04 Thread Li Wang
For a file system created on a file-backed loop device, there are two levels of
page cache present, which typically doubles the memory consumption.
In many cases, it is beneficial to turn on the O_DIRECT option while performing
file IO on the upper file system, to bypass the upper page cache; this not only
halves the memory consumption, but also improves performance due to the shorter
copy path.

For example, the following iozone REREAD test with O_DIRECT turned on
enjoys a 10x speedup over the one without, by eliminating the redundant
cache and consequently avoiding page cache thrashing,
on a 2GB-memory machine running a 3.2.9 kernel.

losetup /dev/loop0 dummy // dummy is an ext4 file with a size of 1.1GB
mkfs -t ext2 /dev/loop0
mount /dev/loop0 /dsk
cd /dsk
iozone -t 1 -s 1G -r 4M -i 0 -+n -w // produce a 1GB test file
iozone -t 1 -s 1G -r 4M -i 1 -w // REREAD test without O_DIRECT
echo 1 > /proc/sys/vm/drop_caches // cleanup the page cache
iozone -t 1 -s 1G -r 4M -i 1 -w -I // REREAD test with O_DIRECT

This feature is also expected to be useful in virtualization situations: the
file systems inside
the guest operating system will use much less guest memory, which
potentially results in less
host memory use. It may be especially useful if multiple guests are
running from the same disk image file.

The idea is simple: leave the decision to the file system user, enabling file
system mount
wide O_DIRECT support with a new mount option, for example,

losetup /dev/loop0 dummy
mount /dev/loop0 -o MS_DIRECT /dsk

Below is the preliminary patch,

---
 fs/open.c  |5 +
 fs/super.c |2 ++
 include/linux/fs.h |1 +
 3 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/fs/open.c b/fs/open.c
index e1f2cdb..dacac30 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -958,6 +958,11 @@ long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
} else {
fsnotify_open(f);
fd_install(fd, f);
+   if (f->f_vfsmnt->mnt_sb && f->f_vfsmnt->mnt_sb->s_flags & MS_DIRECT) {
+   if (S_ISREG(f->f_dentry->d_inode->i_mode)) {
+   if (!f->f_mapping->a_ops || ((!f->f_mapping->a_ops->direct_IO) && (!f->f_mapping->a_ops->get_xip_mem)))
+   f->f_flags |= O_DIRECT;
+   }
}
}
putname(tmp);
diff --git a/fs/super.c b/fs/super.c
index 0902cfa..ab5c4a5 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1147,6 +1147,8 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
WARN_ON(!sb->s_bdi);
WARN_ON(sb->s_bdi == &default_backing_dev_info);
sb->s_flags |= MS_BORN;
+   if (flags & MS_DIRECT)
+   sb->s_flags |= MS_DIRECT;
 
error = security_sb_kern_mount(sb, flags, secdata);
if (error)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index aa11047..127cc85 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -225,6 +225,7 @@ struct inodes_stat_t {
 #define MS_KERNMOUNT   (1<<22) /* this is a kern_mount call */
 #define MS_I_VERSION   (1<<23) /* Update inode I_version field */
 #define MS_STRICTATIME (1<<24) /* Always perform atime updates */
+#define MS_DIRECT  (1<<27)
 #define MS_NOSEC   (1<<28)
 #define MS_BORN(1<<29)
 #define MS_ACTIVE  (1<<30)
-- 
1.7.6.5
