Re: kernel 4.14 how to install Mellanox connectx-3 infiniband driver?

2017-11-14 Thread Tony Yang
Thanks All,

I from
  git://git.infradead.org/nvme.git nvme-4.15 download the nvme ,then
compile with CONFIG_MLX4_INFINIBAND.Now I encountered a problem that
can not find service opensmd and openibd, how to solve, thank you

[root@cesdb01 nvme-4.15]# uname -r
4.13.0-rc7+

[root@cesdb01 ~]# pwd
/root
[root@cesdb01 ~]# cd /u01/soft/u01/soft/4.15/nvme-4.15/
[root@cesdb01 nvme-4.15]# grep -i nvme .config
CONFIG_NVME_CORE=m
CONFIG_BLK_DEV_NVME=m
CONFIG_NVME_FABRICS=m
CONFIG_NVME_RDMA=m
CONFIG_NVME_FC=m
CONFIG_NVME_TARGET=m
CONFIG_NVME_TARGET_LOOP=m
CONFIG_NVME_TARGET_RDMA=m
CONFIG_NVME_TARGET_FC=m
CONFIG_NVME_TARGET_FCLOOP=m
CONFIG_RTC_NVMEM=y
CONFIG_NVMEM=y
[root@cesdb01 nvme-4.15]# grep -i mellanox .config
CONFIG_NET_VENDOR_MELLANOX=y
[root@cesdb01 nvme-4.15]# grep -i ib .config
CONFIG_GENERIC_CALIBRATE_DELAY=y
CONFIG_ARCH_HIBERNATION_POSSIBLE=y
CONFIG_ARCH_SUSPEND_POSSIBLE=y
CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
CONFIG_USELIB=y
CONFIG_BLK_DEV_BSGLIB=y
CONFIG_ARCH_HIBERNATION_HEADER=y
CONFIG_HIBERNATE_CALLBACKS=y
CONFIG_HIBERNATION=y
CONFIG_ACPI_REV_OVERRIDE_POSSIBLE=y
CONFIG_ACPI_CPPC_LIB=y
CONFIG_X86_SPEEDSTEP_LIB=m
CONFIG_HOTPLUG_PCI_ACPI_IBM=m
CONFIG_YENTA_TOSHIBA=y
CONFIG_IP_FIB_TRIE_STATS=y
# CONFIG_NFT_FIB_IPV4 is not set
# CONFIG_NFT_FIB_IPV6 is not set
CONFIG_IP_DCCP_TFRC_LIB=y
CONFIG_BT_HCIBTUSB=m
CONFIG_BT_HCIBTUSB_BCM=y
CONFIG_BT_HCIBTUSB_RTL=y
CONFIG_BT_HCIBTSDIO=m
CONFIG_BT_HCIBCM203X=m
CONFIG_BT_HCIBPA10X=m
CONFIG_BT_HCIBFUSB=m
CONFIG_FIB_RULES=y
CONFIG_LIB80211=m
# CONFIG_LIB80211_DEBUG is not set
CONFIG_CEPH_LIB=m
# CONFIG_CEPH_LIB_PRETTYDEBUG is not set
CONFIG_CEPH_LIB_USE_DNS_RESOLVER=y
# CONFIG_IBM_ASM is not set
# CONFIG_CXL_LIB is not set
CONFIG_SCSI_SAS_LIBSAS=m
CONFIG_LIBFC=m
CONFIG_LIBFCOE=m
CONFIG_PATA_TOSHIBA=m
CONFIG_TCM_IBLOCK=m
# Distributed Switch Architecture drivers
CONFIG_CHELSIO_LIB=m
CONFIG_MLX5_CORE_IPOIB=y
CONFIG_PHYLIB=y
CONFIG_BCM_NET_PHYLIB=m
# CONFIG_LIBERTAS is not set
# CONFIG_LIBERTAS_THINFIRM is not set
CONFIG_RT2800_LIB=m
CONFIG_RT2800_LIB_MMIO=m
CONFIG_RT2X00_LIB_MMIO=m
CONFIG_RT2X00_LIB_PCI=m
CONFIG_RT2X00_LIB_USB=m
CONFIG_RT2X00_LIB=m
CONFIG_RT2X00_LIB_FIRMWARE=y
CONFIG_RT2X00_LIB_CRYPTO=y
CONFIG_RT2X00_LIB_LEDS=y
CONFIG_RT2X00_LIB_DEBUGFS=y
CONFIG_SERIO_LIBPS2=y
# CONFIG_GPIOLIB is not set
CONFIG_SENSORS_IBMAEM=m
CONFIG_SENSORS_IBMPEX=m
CONFIG_IB700_WDT=m
CONFIG_IBMASR=m
CONFIG_SSB_POSSIBLE=y
CONFIG_SSB_PCIHOST_POSSIBLE=y
CONFIG_SSB_SDIOHOST_POSSIBLE=y
CONFIG_SSB_DRIVER_PCICORE_POSSIBLE=y
CONFIG_BCMA_POSSIBLE=y
CONFIG_BCMA_HOST_PCI_POSSIBLE=y
CONFIG_DVB_USB_DIB3000MC=m
CONFIG_DVB_USB_DIBUSB_MB=m
# CONFIG_DVB_USB_DIBUSB_MB_FAULTY is not set
CONFIG_DVB_USB_DIBUSB_MC=m
CONFIG_DVB_USB_DIB0700=m
CONFIG_DVB_DIB3000MB=m
CONFIG_DVB_DIB3000MC=m
CONFIG_DVB_DIB7000M=m
CONFIG_DVB_DIB7000P=m
CONFIG_DVB_DIB8000=m
CONFIG_DVB_TUNER_DIB0070=m
CONFIG_DVB_TUNER_DIB0090=m
# CONFIG_DRM_HISI_HIBMC is not set
# CONFIG_DRM_LIB_RANDOM is not set
# CONFIG_FB_SVGALIB is not set
# CONFIG_FB_IBM_GXT4500 is not set
CONFIG_SND_OPL3_LIB=m
CONFIG_SND_OPL3_LIB_SEQ=m
# CONFIG_SND_OPL4_LIB_SEQ is not set
CONFIG_SND_VX_LIB=m
CONFIG_SND_OXYGEN_LIB=m
# CONFIG_SND_SONICVIBES is not set
CONFIG_SND_FIREWIRE_LIB=m
# CONFIG_USB_TRANCEVIBRATOR is not set
# CONFIG_MMC_TOSHIBA_PCI is not set
# CONFIG_ACCESSIBILITY is not set
CONFIG_INFINIBAND=m
CONFIG_INFINIBAND_USER_MAD=m
CONFIG_INFINIBAND_USER_ACCESS=m
CONFIG_INFINIBAND_USER_MEM=y
CONFIG_INFINIBAND_ON_DEMAND_PAGING=y
CONFIG_INFINIBAND_ADDR_TRANS=y
CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS=y
CONFIG_INFINIBAND_MTHCA=m
CONFIG_INFINIBAND_MTHCA_DEBUG=y
CONFIG_INFINIBAND_CXGB3=m
# CONFIG_INFINIBAND_CXGB3_DEBUG is not set
CONFIG_INFINIBAND_CXGB4=m
# CONFIG_INFINIBAND_I40IW is not set
CONFIG_MLX4_INFINIBAND=m
CONFIG_MLX5_INFINIBAND=m
CONFIG_INFINIBAND_NES=m
# CONFIG_INFINIBAND_NES_DEBUG is not set
CONFIG_INFINIBAND_OCRDMA=m
# CONFIG_INFINIBAND_VMWARE_PVRDMA is not set
CONFIG_INFINIBAND_USNIC=m
CONFIG_INFINIBAND_IPOIB=m
CONFIG_INFINIBAND_IPOIB_CM=y
CONFIG_INFINIBAND_IPOIB_DEBUG=y
# CONFIG_INFINIBAND_IPOIB_DEBUG_DATA is not set
CONFIG_INFINIBAND_SRP=m
CONFIG_INFINIBAND_SRPT=m
CONFIG_INFINIBAND_ISER=m
CONFIG_INFINIBAND_ISERT=m
# CONFIG_INFINIBAND_OPA_VNIC is not set
# CONFIG_INFINIBAND_RDMAVT is not set
# CONFIG_INFINIBAND_BNXT_RE is not set
CONFIG_RTC_LIB=y
CONFIG_RTC_MC146818_LIB=y
CONFIG_RTLLIB=m
CONFIG_RTLLIB_CRYPTO_CCMP=m
CONFIG_RTLLIB_CRYPTO_TKIP=m
CONFIG_RTLLIB_CRYPTO_WEP=m
CONFIG_TOSHIBA_BT_RFKILL=m
# CONFIG_TOSHIBA_HAPS is not set
# CONFIG_TOSHIBA_WMI is not set
# CONFIG_IBM_RTL is not set
# CONFIG_LIBNVDIMM is not set
CONFIG_ISCSI_IBFT_FIND=y
CONFIG_ISCSI_IBFT=m
# EFI (Extensible Firmware Interface) Support
CONFIG_SQUASHFS_ZLIB=y
CONFIG_PSTORE_ZLIB_COMPRESS=y
# CONFIG_SECURITY_INFINIBAND is not set
# Library routines
CONFIG_LIBCRC32C=m
CONFIG_ZLIB_INFLATE=y
CONFIG_ZLIB_DEFLATE=y
CONFIG_MPILIB=y
[root@cesdb01 nvme-4.15]#

[root@cesdb01 nvme-4.15]# ifconfig -a
eno1: flags=4163  mtu 1500
inet 

Re: [RFC PATCH] blk-throttle: add burst allowance.

2017-11-14 Thread Khazhismel Kumykov
(Botched the to line, sorry)


smime.p7s
Description: S/MIME Cryptographic Signature


Re: [PATCH 00/12 v5] Multiqueue for MMC/SD

2017-11-14 Thread Linus Walleij
On Tue, Nov 14, 2017 at 2:30 PM, Bartlomiej Zolnierkiewicz
 wrote:
> On Tuesday, November 14, 2017 01:17:34 PM Bartlomiej Zolnierkiewicz wrote:
>
>> This works much better than initial version and a simple dd read
>> test shows more consistent results than with vanilla kernel.
>>
>> However there are still some issues:
>>
>> 1. 30 seconds delay on "Waiting for /dev to be fully populated..."
>>during boot
>>
>> 2. reboot command no longer works (there is a livelock after
>>"The system is going down for reboot NOW!" message)
>>
>> Full log (together with SysRq-l & SysRq-t outputs) attached.
>
> BTW: these problems are not present in Adrian's V13 patchset
> (with mmc-mq enabled by default).

Yes, as I even say in the cover letter I think his patches are
better so we should use those.

Bart can you provide a Tested-by for Adrian' patch set?

Yours,
Linus Walleij


Re: [PATCH V4] scsi_debugfs: fix crash in scsi_show_rq()

2017-11-14 Thread James Bottomley
On Tue, 2017-11-14 at 08:55 +0800, Ming Lei wrote:
> Hi James,
> 
> On Mon, Nov 13, 2017 at 10:55:52AM -0800, James Bottomley wrote:
> > 
> > On Sat, 2017-11-11 at 10:43 +0800, Ming Lei wrote:
> > > 
> > > So from CPU1's review, cmd->cmnd is in a remote NUMA node,
> > > __scsi_format_command() is executed much slower than
> > > mempool_free().
> > > So when mempool_free() returns, __scsi_format_command() may not
> > > fetched the buffer in L1 cache yet, then use-after-free
> > > is still triggered.
> > > 
> > > That is why I say this use-after-free is inevitable no matter
> > > 'setting SCpnt->cmnd to NULL before calling mempool_free()' or
> > > not.
> > 
> > The bottom line is that there are several creative ways around this
> > but the proposed code is currently broken and simply putting a
> > comment in saying so doesn't make it acceptable.
> 
> As I explained above, I didn't see one really workable way. Or please
> correct it if I am wrong.

I simply can't believe it's beyond the wit of man to solve a use after
free race.  About 40% of kernel techniques are devoted to this.  All I
really care about is not losing the PI information we previously had.
 I agree with Bart that NULL cmnd is a good indicator, so it seems
reasonable to use it.  If you have another mechanism, feel free to
propose it.

James



[PATCH] blkcg: correctly update stat

2017-11-14 Thread Shaohua Li
blkcg_bio_issue_check() checks throtl for stat update, which isn't good
because the bio could be splitted into small bios, the samll bios go
into blkcg_bio_issue_check again and we update stat for the small bios,
so we the stat is double charged. To fix this, we only update stat if
the bio doesn't have BIO_THROTTLED flag. The fix will update stat ahead
of bio skips from blk-throttle, but eventually the stat is correct.

This patch is on top of patch:
https://marc.info/?l=linux-block=151060860608914=2

Cc: Tejun Heo 
Signed-off-by: Shaohua Li 
---
 include/linux/blk-cgroup.h | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 58f3d25..6fbd9ea 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -690,7 +690,8 @@ static inline bool blkcg_bio_issue_check(struct 
request_queue *q,
 {
struct blkcg *blkcg;
struct blkcg_gq *blkg;
-   bool throtl = false;
+   bool throtl;
+   bool charged;
 
rcu_read_lock();
blkcg = bio_blkcg(bio);
@@ -707,9 +708,10 @@ static inline bool blkcg_bio_issue_check(struct 
request_queue *q,
spin_unlock_irq(q->queue_lock);
}
 
+   charged = bio_flagged(bio, BIO_THROTTLED);
throtl = blk_throtl_bio(q, blkg, bio);
 
-   if (!throtl) {
+   if (!charged) {
blkg = blkg ?: q->root_blkg;
blkg_rwstat_add(>stat_bytes, bio->bi_opf,
bio->bi_iter.bi_size);
-- 
2.9.5



Re: [PATCH 2/8] target: remove iblock WRITE_SAME passthrough support

2017-11-14 Thread Bryant G. Ly
On 4/12/17 12:30 AM, Nicholas A. Bellinger wrote:

> On Mon, 2017-04-10 at 18:08 +0200, Christoph Hellwig wrote:
>> Use the pscsi driver to support arbitrary command passthrough
>> instead.
>>
> The people who are actively using iblock_execute_write_same_direct() are
> doing so in the context of ESX VAAI BlockZero, together with
> EXTENDED_COPY and COMPARE_AND_WRITE primitives.  Just using PSCSI is not
> an option for them.
>
> In practice though I've not seen any users of IBLOCK WRITE_SAME for
> anything other than VAAI BlockZero, so just using blkdev_issue_zeroout()
> when available, and falling back to iblock_execute_write_same() if the
> WRITE_SAME buffer contains anything other than zeros should be OK.
>
> How about something like the following below..?
>
> This would bring parity to how blkdev_issue_write_same() works atm wrt
> to synchronous bio completions.  However, most folks with a raw
> make_request or blk-mq backend driver that supports multiple GB/sec of
> zero bandwidth end up changing IBLOCK to support asynchronous
> REQ_WRITE_SAME completions anyways.
>
> I'd be happy to add support for that using __blkdev_issue_zeroout() once
> the basic conversion is in place.
>
> From ff74012eaff38f9fa0d74aca60507b9964f484ce Mon Sep 17 00:00:00 2001
> From: Nicholas Bellinger 
> Date: Tue, 11 Apr 2017 22:21:47 -0700
> Subject: [PATCH] target/iblock: Convert WRITE_SAME to blkdev_issue_zeroout
>
> Signed-off-by: Nicholas Bellinger 
> ---
>  drivers/target/target_core_iblock.c | 44 
> +++--
>  1 file changed, 27 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/target/target_core_iblock.c 
> b/drivers/target/target_core_iblock.c
> index d316ed5..5bfde20 100644
> --- a/drivers/target/target_core_iblock.c
> +++ b/drivers/target/target_core_iblock.c
> @@ -86,6 +86,7 @@ static int iblock_configure_device(struct se_device *dev)
>   struct block_device *bd = NULL;
>   struct blk_integrity *bi;
>   fmode_t mode;
> + unsigned int max_write_zeroes_sectors;
>   int ret = -ENOMEM;
>
>   if (!(ib_dev->ibd_flags & IBDF_HAS_UDEV_PATH)) {
> @@ -129,7 +130,11 @@ static int iblock_configure_device(struct se_device *dev)
>* Enable write same emulation for IBLOCK and use 0x as
>* the smaller WRITE_SAME(10) only has a two-byte block count.
>*/
> - dev->dev_attrib.max_write_same_len = 0x;
> + max_write_zeroes_sectors = bdev_write_zeroes_sectors(bd);
> + if (max_write_zeroes_sectors)
> + dev->dev_attrib.max_write_same_len = max_write_zeroes_sectors;
> + else
> + dev->dev_attrib.max_write_same_len = 0x;
>
>   if (blk_queue_nonrot(q))
>   dev->dev_attrib.is_nonrot = 1;
> @@ -415,28 +420,31 @@ static void iblock_end_io_flush(struct bio *bio)
>  }
>
>  static sense_reason_t
> -iblock_execute_write_same_direct(struct block_device *bdev, struct se_cmd 
> *cmd)
> +iblock_execute_zero_out(struct block_device *bdev, struct se_cmd *cmd)
>  {
>   struct se_device *dev = cmd->se_dev;
>   struct scatterlist *sg = >t_data_sg[0];
> - struct page *page = NULL;
> - int ret;
> + unsigned char *buf, zero = 0x00, *p = 
> + int rc, ret;
>
> - if (sg->offset) {
> - page = alloc_page(GFP_KERNEL);
> - if (!page)
> - return TCM_OUT_OF_RESOURCES;
> - sg_copy_to_buffer(sg, cmd->t_data_nents, page_address(page),
> -   dev->dev_attrib.block_size);
> - }
> + buf = kmap(sg_page(sg)) + sg->offset;
> + if (!buf)
> + return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
> + /*
> +  * Fall back to block_execute_write_same() slow-path if
> +  * incoming WRITE_SAME payload does not contain zeros.
> +  */
> + rc = memcmp(buf, p, cmd->data_length);
> + kunmap(sg_page(sg));
> +

I recently pulled in this patch and I am getting:

[  716.756756] [ cut here ]
[  716.756757] kernel BUG at 
/build/linux-hwe-edge-F6_Smd/linux-hwe-edge-4.13.0/lib/string.c:985!
[  716.756762] Oops: Exception in kernel mode, sig: 5 [#1]
[  716.756764] SMP NR_CPUS=2048 
[  716.756765] NUMA 
[  716.756767] pSeries
[  716.756769] Modules linked in: hvcs(OE) hvcserver dm_snapshot dm_bufio 
ip6table_raw xt_CT xt_mac xt_tcpudp xt_comment xt_physdev xt_set 
ip_set_hash_net ip_set rpadlpar_io rpaphp iptable_raw target_core_pscsi(OE) 
target_core_file(OE) target_core_iblock(OE) iscsi_target_mod(OE) dccp_diag dccp 
tcp_diag udp_diag inet_diag unix_diag af_packet_diag netlink_diag vport_vxlan 
vxlan ip6_udp_tunnel udp_tunnel openvswitch nf_nat_ipv6 target_core_user(OE) 
uio binfmt_misc ibmvmc(OE) pseries_rng vmx_crypto crct10dif_vpmsum xt_conntrack 
nf_nat_ftp nf_conntrack_ftp nf_conntrack_netlink nfnetlink 
nf_conntrack_netbios_ns nf_conntrack_broadcast nf_conntrack_ipv6 nf_defrag_ipv6 
nbd ipt_REJECT nf_reject_ipv4 ipt_MASQUERADE 

[PATCH/RFC] blk-mq: improve heavily contended tag case

2017-11-14 Thread Jens Axboe
Even with a number of waitqueues, we can get into a situation where we
are heavily contended on the waitqueue lock. I got a report on spc1
where we're spending seconds doing this. Arguably the use case is nasty,
I reproduce it with one device and 1000 threads banging on the device.
But that doesn't mean we shouldn't be handling it better.

What ends up happening is that a thread will fail to get a tag, add
itself to the waitqueue, and subsequently get woken up when a tag is
freed - only to find itself going back to sleep on the waitqueue.

Instead of waking all threads, use an exclusive wait and wake up our
sbitmap batch count instead. This seems to work well for me (massive
improvement for this use case), and it survives basic testing. But I
haven't fully verified it yet.

An additional improvement is running the queue and checking for a new
tag BEFORE needing to add ourselves to the waitqueue.

Signed-off-by: Jens Axboe 

---

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index c81b40ecd3f1..336dde07b230 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -134,12 +134,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
ws = bt_wait_ptr(bt, data->hctx);
drop_ctx = data->ctx == NULL;
do {
-   prepare_to_wait(>wait, , TASK_UNINTERRUPTIBLE);
-
-   tag = __blk_mq_get_tag(data, bt);
-   if (tag != -1)
-   break;
-
/*
 * We're out of tags on this hardware queue, kick any
 * pending IO submits before going to sleep waiting for
@@ -155,6 +149,13 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
if (tag != -1)
break;
 
+   prepare_to_wait_exclusive(>wait, ,
+   TASK_UNINTERRUPTIBLE);
+
+   tag = __blk_mq_get_tag(data, bt);
+   if (tag != -1)
+   break;
+
if (data->ctx)
blk_mq_put_ctx(data->ctx);
 
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 80aa8d5463fa..42b5ca0acf93 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -462,7 +462,7 @@ static void sbq_wake_up(struct sbitmap_queue *sbq)
 */
atomic_cmpxchg(>wait_cnt, wait_cnt, wait_cnt + wake_batch);
sbq_index_atomic_inc(>wake_index);
-   wake_up(>wait);
+   wake_up_nr(>wait, wake_batch);
}
 }
 
-- 
Jens Axboe



GPT

2017-11-14 Thread peter.boettc...@gmx.net


Peter Böttcher
Hingbergstrasse 363
45472 Muelheim Ruhr
NRW
Germany

mail: peter.boettc...@gmx.net


Sorry for this english.

I wrote a partition program just for the GUID Partition Table (GPT). 5 
years ago.


The program can generate volumes from 4 to 4096  partitions (my own limit).

GPT header pos. 80 is an unsigned 32 Bit field.

The program can create partitions with the numbers 1, 10, 100, 200, 300, 
1000, 2000 etc. on the volume.


Windows, FREE/BSD Unix, Mac OS X manage a volume without problems.
They mount all partitions without problems.



So that Linux can do such a volume management, I had to patch the file 
include/linux/genhd.h up to the Linux kernel 4.13.12.

It worked fine for many years.


-

patch: include/linux/genhd.h

old: #define DISK_MAX_PARTS  256
new: #define DISK_MAX_PARTS  4096

-

In Linux Kernel 4.14.0 I have to patch the following files.

-

patch: include/linux/genhd.h

old: #define DISK_MAX_PARTS  256
new: #define DISK_MAX_PARTS  4096

-
patch: include/linux/fs.h

struct block_device {

old: u8 bd_partno;
new: u16 bd_partno;

-
patch: include/linux/blk_types.h

struct bio {

old: u8 bio_partno;
new: u16 bio_partno;

-

patch: driver/md/dm-bio-record.h

struct dm_bio_details {

  old: u8 bio_partno;
  new: u16 bio_partno;

-
patch: driver/md/dm-bio-integrity.c

struct dm_integrity_io {

old: u8 bio_partno;
new: u16 bio_partno;

-

This works well (admittedly short time).


Yours truly
Peter Böttcher









Re: [PATCH V13 03/10] mmc: block: Add blk-mq support

2017-11-14 Thread Linus Walleij
Hi Adrian!

Thanks for the helpful and productive tone in the recent mail,
it is very helpful and improves my work environment.

>> The block layer maintainers most definately think MQ is ready
>> but you seem to disagree. Why?
>
> I meant when the mmc conversion to blk-mq is ready.  We don't need to
> consider other sub-systems, just whether it is working for mmc.  For
> example, the issue with the block layer workqueues not performing as well as
> a thread.  That only came to light through testing mmc.

OK I see your point.

>> It's bad enough that you repeatedly point out how stupid
>> you think I am
>
> I have never called you names.  On the contrary, it is because I don't think
> you are stupid that I get upset when you seem spend so little time and
> effort to understand the code.  Equally, you seem to make comments that just
> assume the code is wrong without due consideration.

Please work on your patience. I think so few people are reviewing the
code because it is massive and complicated and sits in a complicated
place. One can not be blamed for trying, right?

I have spent considerable time with your code, more than with my own.
Mostly testing it, and that is why I can say it does not regress performance
on my end.

I also ran several rounds of fault injection, which it handled without
problems.

Then I tried to eject the SD card when running DD from the card and
then it crashed.

But that is an extreme test case, so overall it is very robust code.
With the exception of card removal during I/O, feel free to add
my Tested-by: Linus Walleij  on this series.

>> But what I think is nice in doing it around
>> each request is that since mmc_put_card() calls mmc_release_host()
>> contains this:
>>
>> if (--host->claim_cnt) { (...)
>>
>> So there is a counter inside the claim/release host functions
>> and now there is another counter keeping track if the in-flight
>> requests as well and it gives me the feeling we are maintaining
>> two counters when we only need one.
>
> The gets / puts also get runtime pm for the card.  It is a lot a messing
> around for the sake of a quick check for the number of requests inflight -
> which is needed anyway for CQE.

OK I buy that.

>> But maybe it is actually the host->claim_cnt that is overengineered
>> and should just be a bool, becuase with this construction
>> that you only call down and claim the host once, the
>> host->claim_cnt will only be 0 or 1, right?
>
> The claim_cnt was introduced for nested claiming.

Yeah that sounds familiar. I'm not really sure it happens
anymore after this but I guess I can test that with some
practical experiements, let's leave it like this.

>> I guess I could turn that around and ask: if it is so crappy, why
>> is your patch set not deleting it?
>
> To allow people time to test.

OK I guess I'm more forging ahead with such things. But we could
at least enable it by default so whoever checks out and builds
and tests linux-next with their MMC/SD controllers will then be
testing this for us in the next kernel cycle.

>> But I think I would prefer that if a big slew of new code is
>> introduced it needs to be put to wide use and any bugs
>> smoked out during the -rc phase, and we are now
>> hiding it behind a Kconfig option so it's unlikely to see testing
>> until that option is turned on, and that is not good.
>
> And if you find a big problem in rc7, then what do you do?  At least with
> the config option, the revert is trivial.

I see your point. I guess it is up to Ulf how he feels about
trust wrt the code. If a problem appears at -rc7 it's indeed
nice if we can leave the code in-tree and work on it.

>> At least take a moment to consider the option that maybe so few people
>> are reviwing your code because this is complicated stuff and really
>> hard to grasp, maybe the problem isn't on my side, neither on yours,
>> it may be that the subject matter is the problem.
>
> I would expect a review would take a number of days, perhaps longer.
> However, it seems people are not willing to consider anything that takes
> more than a an hour or two.

Your criticism is something like "do it properly or not at all"
and I understand that feeling. I deserve some of the criticism
and I can accept it much easier when put with these words.

It would be nice if the community would step in here and I have
seen so many people talk about this code that I really think they
should invest in proper review. More people than me and Ulf
need to help out with review and test.

>> I would say a synchronization primitive is needed, indeed.
>> I have a different approach to this, which uses completion
>> as the synchronization primitive and I think that would be possible
>> to use also here.
>
> Because they are both just wake-ups.  When waiting for multiple conditions,
> wait_event() is simpler.  You are not considering the possibility for having
> only 1 task switch between requests.

That is indeed a nice feature.

>>
>> I have 

Re: [PATCH 00/12 v5] Multiqueue for MMC/SD

2017-11-14 Thread Bartlomiej Zolnierkiewicz
On Tuesday, November 14, 2017 01:17:34 PM Bartlomiej Zolnierkiewicz wrote:

> This works much better than initial version and a simple dd read
> test shows more consistent results than with vanilla kernel.
> 
> However there are still some issues:
> 
> 1. 30 seconds delay on "Waiting for /dev to be fully populated..."
>during boot
> 
> 2. reboot command no longer works (there is a livelock after
>"The system is going down for reboot NOW!" message)
> 
> Full log (together with SysRq-l & SysRq-t outputs) attached.

BTW: these problems are not present in Adrian's V13 patchset
(with mmc-mq enabled by default).

Best regards,
--
Bartlomiej Zolnierkiewicz
Samsung R Institute Poland
Samsung Electronics



Re: [PATCH V13 03/10] mmc: block: Add blk-mq support

2017-11-14 Thread Adrian Hunter
On 09/11/17 17:52, Linus Walleij wrote:
> On Thu, Nov 9, 2017 at 11:42 AM, Adrian Hunter  
> wrote:
>> On 08/11/17 10:54, Linus Walleij wrote:
>>> On Fri, Nov 3, 2017 at 2:20 PM, Adrian Hunter  
>>> wrote:
> 
>>> At least you could do what I did and break out a helper like
>>> this:
>>>
>>> /*
>>>  * This reports status back to the block layer for a finished request.
>>>  */
>>> static void mmc_blk_complete(struct mmc_queue_req *mq_rq,
>>> blk_status_t status)
>>> {
>>>struct request *req = mmc_queue_req_to_req(mq_rq);
>>>
>>>if (req->mq_ctx) {
>>>   blk_mq_end_request(req, status);
>>>} else
>>>   blk_end_request_all(req, status);
>>> }
>>
>> You are quibbling.
> 
> Hm wiktionary "quibble" points to:
> https://en.wikipedia.org/wiki/Trivial_objections
> 
> Sorry for not being a native speaker in English, I think that
> maybe you were again trying to be snarky to a reviewer but
> I honestly don't know.
> 
>> It makes next to no difference especially as it all goes
>> away when the legacy code is deleted.
> 
> Which is something your patch set doesn't do. Nor did you write
> anywhere that deleting the legacy code was your intention.
> But I'm happy to hear it.
> 
>>  I will change it in the next
>> revision, but what a waste of everyone's time.
>>  Please try to focus on
>> things that matter.
> 
> As Jean-Paul Sartre said "hell is other people".
> Can you try to be friendlier anyway?
> 
>>> This retry and error handling using requeue is very elegant.
>>> I really like this.
>>>
>>> If we could also go for MQ-only, only this nice code
>>> remains in the tree.
>>
>> No one has ever suggested that the legacy API will remain.  Once blk-mq is
>> ready the old code gets deleted.
> 
> The block layer maintainers most definately think MQ is ready
> but you seem to disagree. Why?

I meant when the mmc conversion to blk-mq is ready.  We don't need to
consider other sub-systems, just whether it is working for mmc.  For
example, the issue with the block layer workqueues not performing as well as
a thread.  That only came to light through testing mmc.

> 
> In the recent LWN article from kernel recepies:
> https://lwn.net/Articles/735275/
> the only obstacle I can see is a mention that SCSI was not
> switched over by default because of some problems with
> slow rotating media. "This change had to be backed out
> because of some performance and scalability issues that
> arose for rotating storage."
> 
> Christoph mentions that he's gonna try again for v4.14.
> But it's true, I do not see that happening in linux-next yet.
> 
> But that is specifically rotating media, which MMC/SD is not.
> And UBI is using it since ages without complaints. And
> that is a sign of something.
> 
> Have you seen any problems when deploying it on MMC/SD,
> really? If there are problems I bet the block layer people want
> us to find them, diagnose them and ultimately fix them rather
> than wait for them to do it. I haven't seen any performance or
> scalability issues. At one point I had processes running
> on two eMMC and one SD-card and apart from maxing out
> the CPUs and DMA backplace as could be expected all was
> fine.
> 
>>> The problem: you have just reimplemented the whole error
>>> handling we had in the old block layer and now we have to
>>> maintain two copies and keep them in sync.
>>>
>>> This is not OK IMO, we will inevitable screw it up, so we
>>> need to get *one* error path.
>>
>> Wow, you really didn't read the code at all.
> 
> Just quit this snarky attitude.
> 
>> As I have repeatedly pointed
>> out, the new code is all new.  There is no overlap and there nothing to keep
>> in sync.
> 
> The problem is that you have not clearly communicated your
> vision to delete the old code. The best way to communicate that
> would be to include a patch actually deleteing it.
> 
>>  It may not look like it in this patch, but that is only because of
>> the ridiculous idea of splitting up the patch.
> 
> Naming other people's review feedback as "ridiculous" is not very
> helpful. I think the patch set became much easier to review
> like this so I am happy with this format.
> 
 +static void mmc_blk_mq_acct_req_done(struct mmc_queue *mq, struct request 
 *req)
>>>
>>> What does "acct" mean in the above function name?
>>> Accounting? Actual? I'm lost.
>>
>> Does "actual" have two "c"'s.  You are just making things up.
> 
> Please stop your snarky and belitteling attitude.
> 
> I am not a native English speaker and I am not making up
> my review comments in order to annoy you.
> 
> Consider Hanlon's razor:
> https://en.wikipedia.org/wiki/Hanlon%27s_razor
> "Never attribute to malice that which is adequately explained by
> stupidity".
> 
> It's bad enough that you repeatedly point out how stupid
> you think I am

I have never called you names.  On the contrary, it is because I don't think
you are stupid that I get upset 

Re: [PATCH 00/12 v5] Multiqueue for MMC/SD

2017-11-14 Thread Bartlomiej Zolnierkiewicz

Hi Linus,

On Friday, November 10, 2017 11:01:31 AM Linus Walleij wrote:
> This is the fifth iteration of this patch set.
> 
> I *HOPE* that we can scrap this patch set and merge Adrian's
> patches instead, because they also bring CQE support which is
> nice. I had some review comments on his series, mainly that
> it needs to kill off the legacy block layer code path that
> noone likes anyway.
> 
> So this is mainly an academic and inspirational exercise.
> Whatever remains of this refactoring, if anything, I can
> certainly do on top of Adrian's patches as well.
> 
> What changed since v4 is the error path, since Adrian pointed
> out that the error handling seems to be fragile. It was indeed
> fragile... To make sure things work properly I have run long
> test rounds with fault injection, essentially:
> 
> Enable FAULT_INJECTION, FAULT_INJECTION_DEBUG_FS,
>FAIL_MMC_REQUEST
> cd /debug/mmc3/fail_mmc_request/
> echo 1 > probability
> echo -1 > times
> 
> Then running a dd to the card, also increased the error rate
> to 10% and completed tests successfully, but at this error
> rate the MMC stack sometimes exceeds the retry limit and the
> dd command fails (as is appropriate).
> 
> Removing a card during I/O does not work well however :/
> So I guess I would need to work on that if this series should
> continue. (Hopefully unlikely.)
> 
> 
> Linus Walleij (12):
>   mmc: core: move the asynchronous post-processing
>   mmc: core: add a workqueue for completing requests
>   mmc: core: replace waitqueue with worker
>   mmc: core: do away with is_done_rcv
>   mmc: core: do away with is_new_req
>   mmc: core: kill off the context info
>   mmc: queue: simplify queue logic
>   mmc: block: shuffle retry and error handling
>   mmc: queue: stop flushing the pipeline with NULL
>   mmc: queue/block: pass around struct mmc_queue_req*s
>   mmc: block: issue requests in massive parallel
>   mmc: switch MMC/SD to use blk-mq multiqueueing v5
> 
>  drivers/mmc/core/block.c| 557 
> +++-
>  drivers/mmc/core/block.h|   5 +-
>  drivers/mmc/core/bus.c  |   1 -
>  drivers/mmc/core/core.c | 217 ++---
>  drivers/mmc/core/core.h |  11 +-
>  drivers/mmc/core/host.c |   1 -
>  drivers/mmc/core/mmc_test.c |  31 +--
>  drivers/mmc/core/queue.c| 252 
>  drivers/mmc/core/queue.h|  16 +-
>  include/linux/mmc/core.h|   3 +-
>  include/linux/mmc/host.h|  31 +--
>  11 files changed, 557 insertions(+), 568 deletions(-)

This works much better than initial version and a simple dd read
test shows more consistent results than with vanilla kernel.

However there are still some issues:

1. 30 seconds delay on "Waiting for /dev to be fully populated..."
   during boot

2. reboot command no longer works (there is a livelock after
   "The system is going down for reboot NOW!" message)

Full log (together with SysRq-l & SysRq-t outputs) attached.

Best regards,
--
Bartlomiej Zolnierkiewicz
Samsung R Institute Poland
Samsung Electronics


log.txt.gz
Description: application/gzip